Buckets:

HuggingFaceDocBuilder's picture
download
raw
50.7 kB
import{s as It,o as Bt,n as Et}from"../chunks/scheduler.53228c21.js";import{S as Zt,i as Nt,e as r,s as o,c as f,h as Vt,a as c,d as n,b as s,f as J,g,j as p,k as S,l as a,m as i,n as h,t as _,o as y,p as v}from"../chunks/index.cac5d66a.js";import{C as Qt}from"../chunks/CopyLLMTxtMenu.4912207d.js";import{D as oe}from"../chunks/Docstring.1e7ac4f3.js";import{C as Gt}from"../chunks/CodeBlock.606cbaf4.js";import{E as zt}from"../chunks/ExampleCodeBlock.ccf7d2a9.js";import{H as De,E as Dt}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.323ee77a.js";function Lt(Me){let m,P="Examples:",T,b,M;return b=new Gt({props:{code:"aW1wb3J0JTIwdG9yY2glMEFpbXBvcnQlMjBzb3VuZGZpbGUlMjBhcyUyMHNmJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMEFjZVN0ZXBQaXBlbGluZSUwQSUwQXBpcGUlMjAlM0QlMjBBY2VTdGVwUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMkFDRS1TdGVwJTJGYWNlc3RlcC12MTUteGwtdHVyYm8tZGlmZnVzZXJzJTIyJTJDJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5iZmxvYXQxNiUwQSklMEFwaXBlJTIwJTNEJTIwcGlwZS50byglMjJjdWRhJTIyKSUwQSUwQSUyMyUyMFRleHQtdG8tbXVzaWMlMjBnZW5lcmF0aW9uJTIwd2l0aCUyMG1ldGFkYXRhJTBBYXVkaW8lMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMHByb21wdCUzRCUyMkElMjBiZWF1dGlmdWwlMjBwaWFubyUyMHBpZWNlJTIwd2l0aCUyMHNvZnQlMjBtZWxvZGllcyUyMiUyQyUwQSUyMCUyMCUyMCUyMGx5cmljcyUzRCUyMiU1QnZlcnNlJTVEJTVDblNvZnQlMjBub3RlcyUyMGluJTIwdGhlJTIwbW9ybmluZyUyMGxpZ2h0JTVDbiU1QmNob3J1cyU1RCU1Q25NdXNpYyUyMGZpbGxzJTIwdGhlJTIwYWlyJTIwdG9uaWdodCUyMiUyQyUwQSUyMCUyMCUyMCUyMGF1ZGlvX2R1cmF0aW9uJTNEMzAuMCUyQyUwQSUyMCUyMCUyMCUyMG51bV9pbmZlcmVuY2Vfc3RlcHMlM0Q4JTJDJTBBJTIwJTIwJTIwJTIwYnBtJTNEMTIwJTJDJTBBJTIwJTIwJTIwJTIwa2V5c2NhbGUlM0QlMjJDJTIwbWFqb3IlMjIlMkMlMEElMjAlMjAlMjAlMjB0aW1lc2lnbmF0dXJlJTNEJTIyNCUyMiUyQyUwQSkuYXVkaW9zJTBBJTBBJTIzJTIwU2F2ZSUyMHRoZSUyMGdlbmVyYXRlZCUyMGF1ZGlvJTBBc2Yud3JpdGUoJTIyb3V0cHV0LndhdiUyMiUyQyUyMGF1ZGlvJTVCMCUyQyUyMDAlNUQuY3B1KCkubnVtcHkoKSUyQyUyMDQ4MDAwKSUwQSUwQSUyMyUyMFJlcGFpbnQlMjB0YXNrJTNBJTIwcmVnZW5lcmF0ZSUyMGElMjBzZWN0aW9uJTIwb2YlMjBleGlzdGluZyUyMHN0ZXJlbyUyMDQ4a0h6JTIwYXVkaW8lMEFzcmNfYXVkaW8lMkMlMjBzciUyMCUzRCUyMHNmLnJlYWQoJTIyaW5wdXQud2F2JTIyKSUwQXNyY19hdWRpbyUyMCUzRCUyMHRvcmNoLmZyb21fbnVtcHkoc3JjX2F1ZGlvKS5mbG9hdCgpLlQlMEFhdWRpbyUyMCUzRCUyMHBpcGUoJTBBJTIwJTIwJTIwJTIwcHJvbXB0JTNEJTIyRXBpYyUyMHJvY2slMjBndWl0YXIlMjBzb2xvJTIyJTJDJTBBJTIwJTIwJTIwJTIwbHlyaWNzJTNEJTIyJTIyJTJDJTBBJTIwJTIwJTIwJTIwdGFza190eXBlJTNEJTIycmVwYWludCUyMiUyQyUwQSUyMCUyMCUyMCUyMHNyY19hdWRpbyUzRHNyY19hdWRpbyUyQyUwQSUyMCUyMCUyMCUyMHJlcGFpbnRpbmdfc3RhcnQlM0QxMC4wJTJDJTBBJTIwJTIwJTIwJTIwcmVwYWludGluZ19lbmQlM0QyMC4wJTJDJTBBKS5hdWRpb3MlMEElMEElMjMlMjBDb3ZlciUyMHRhc2slMjB3aXRoJTIwcmVmZXJlbmNlJTIwYXVkaW8lMjBmb3IlMjB0aW1icmUlMjB0cmFuc2ZlciUwQXJlZl9hdWRpbyUyQyUyMHNyJTIwJTNEJTIwc2YucmVhZCglMjJyZWZlcmVuY2Uud2F2JTIyKSUwQXJlZl9hdWRpbyUyMCUzRCUyMHRvcmNoLmZyb21fbnVtcHkocmVmX2F1ZGlvKS5mbG9hdCgpLlQlMEFhdWRpbyUyMCUzRCUyMHBpcGUoJTBBJTIwJTIwJTIwJTIwcHJvbXB0JTNEJTIyUG9wJTIwc29uZyUyMHdpdGglMjBicmlnaHQlMjB2b2NhbHMlMjIlMkMlMEElMjAlMjAlMjAlMjBseXJpY3MlM0QlMjIlNUJ2ZXJzZSU1RCU1Q25IZWxsbyUyMHdvcmxkJTIyJTJDJTBBJTIwJTIwJTIwJTIwdGFza190eXBlJTNEJTIyY292ZXIlMjIlMkMlMEElMjAlMjAlMjAlMjByZWZlcmVuY2VfYXVkaW8lM0RyZWZfYXVkaW8lMkMlMEElMjAlMjAlMjAlMjBhdWRpb19jb3Zlcl9zdHJlbmd0aCUzRDAuOCUyQyUwQSkuYXVkaW9z",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> soundfile <span class="hljs-keyword">as</span> sf
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AceStepPipeline
<span class="hljs-meta">&gt;&gt;&gt; </span>pipe = AceStepPipeline.from_pretrained(
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;ACE-Step/acestep-v15-xl-turbo-diffusers&quot;</span>, torch_dtype=torch.bfloat16
<span class="hljs-meta">... </span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>pipe = pipe.to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># Text-to-music generation with metadata</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>audio = pipe(
<span class="hljs-meta">... </span> prompt=<span class="hljs-string">&quot;A beautiful piano piece with soft melodies&quot;</span>,
<span class="hljs-meta">... </span> lyrics=<span class="hljs-string">&quot;[verse]\\nSoft notes in the morning light\\n[chorus]\\nMusic fills the air tonight&quot;</span>,
<span class="hljs-meta">... </span> audio_duration=<span class="hljs-number">30.0</span>,
<span class="hljs-meta">... </span> num_inference_steps=<span class="hljs-number">8</span>,
<span class="hljs-meta">... </span> bpm=<span class="hljs-number">120</span>,
<span class="hljs-meta">... </span> keyscale=<span class="hljs-string">&quot;C major&quot;</span>,
<span class="hljs-meta">... </span> timesignature=<span class="hljs-string">&quot;4&quot;</span>,
<span class="hljs-meta">... </span>).audios
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># Save the generated audio</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>sf.write(<span class="hljs-string">&quot;output.wav&quot;</span>, audio[<span class="hljs-number">0</span>, <span class="hljs-number">0</span>].cpu().numpy(), <span class="hljs-number">48000</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># Repaint task: regenerate a section of existing stereo 48kHz audio</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>src_audio, sr = sf.read(<span class="hljs-string">&quot;input.wav&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>src_audio = torch.from_numpy(src_audio).<span class="hljs-built_in">float</span>().T
<span class="hljs-meta">&gt;&gt;&gt; </span>audio = pipe(
<span class="hljs-meta">... </span> prompt=<span class="hljs-string">&quot;Epic rock guitar solo&quot;</span>,
<span class="hljs-meta">... </span> lyrics=<span class="hljs-string">&quot;&quot;</span>,
<span class="hljs-meta">... </span> task_type=<span class="hljs-string">&quot;repaint&quot;</span>,
<span class="hljs-meta">... </span> src_audio=src_audio,
<span class="hljs-meta">... </span> repainting_start=<span class="hljs-number">10.0</span>,
<span class="hljs-meta">... </span> repainting_end=<span class="hljs-number">20.0</span>,
<span class="hljs-meta">... </span>).audios
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># Cover task with reference audio for timbre transfer</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>ref_audio, sr = sf.read(<span class="hljs-string">&quot;reference.wav&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>ref_audio = torch.from_numpy(ref_audio).<span class="hljs-built_in">float</span>().T
<span class="hljs-meta">&gt;&gt;&gt; </span>audio = pipe(
<span class="hljs-meta">... </span> prompt=<span class="hljs-string">&quot;Pop song with bright vocals&quot;</span>,
<span class="hljs-meta">... </span> lyrics=<span class="hljs-string">&quot;[verse]\\nHello world&quot;</span>,
<span class="hljs-meta">... </span> task_type=<span class="hljs-string">&quot;cover&quot;</span>,
<span class="hljs-meta">... </span> reference_audio=ref_audio,
<span class="hljs-meta">... </span> audio_cover_strength=<span class="hljs-number">0.8</span>,
<span class="hljs-meta">... </span>).audios`,lang:"py",wrap:!1}}),{c(){m=r("p"),m.textContent=P,T=o(),f(b.$$.fragment)},l(u){m=c(u,"P",{"data-svelte-h":!0}),p(m)!=="svelte-kvfsh7"&&(m.textContent=P),T=s(u),g(b.$$.fragment,u)},m(u,x){i(u,m,x),i(u,T,x),h(b,u,x),M=!0},p:Et,i(u){M||(_(b.$$.fragment,u),M=!0)},o(u){y(b.$$.fragment,u),M=!1},d(u){u&&(n(m),n(T)),v(b,u)}}}function Ht(Me){let m,P,T,b,M,u,x,xe,q,pt='ACE-Step 1.5 was introduced in <a href="https://arxiv.org/abs/2602.00744" rel="nofollow">ACE-Step 1.5: Pushing the Boundaries of Open-Source Music Generation</a> by the ACE-Step Team (ACE Studio and StepFun). It is an open-source music foundation model that generates commercial-grade stereo music with lyrics from text prompts.',Te,G,ut='ACE-Step 1.5 generates variable-length stereo audio at 48 kHz (10 seconds to 10 minutes) from text prompts and optional lyrics. The full system pairs a Language Model planner with a Diffusion Transformer (DiT) synthesizer; this pipeline wraps the DiT half of that stack, and consists of three components: an <a href="/docs/diffusers/pr_13745/en/api/models/autoencoder_oobleck#diffusers.AutoencoderOobleck">AutoencoderOobleck</a> VAE that compresses waveforms into 25 Hz stereo latents, a Qwen3-based text encoder for prompt and lyric conditioning, and an <a href="/docs/diffusers/pr_13745/en/api/models/ace_step_transformer#diffusers.AceStepTransformer1DModel">AceStepTransformer1DModel</a> DiT that operates in the VAE latent space using flow matching.',we,I,mt="The model supports 50+ languages for lyrics — including English, Chinese, Japanese, Korean, French, German, Spanish, Italian, Portuguese, and Russian — and runs on consumer GPUs (under 4 GB of VRAM when offloaded).",Ue,B,ft='This pipeline was contributed by the <a href="https://github.com/ace-step" rel="nofollow">ACE-Step Team</a>. The original codebase can be found at <a href="https://github.com/ace-step/ACE-Step-1.5" rel="nofollow">ace-step/ACE-Step-1.5</a>.',Ce,E,Je,Z,gt="ACE-Step 1.5 ships three DiT checkpoints that share the same transformer architecture but differ in guidance behavior; the pipeline auto-detects turbo checkpoints from the loaded transformer config and ignores CFG guidance for those guidance-distilled weights.",Se,N,ht='<thead><tr><th>Variant</th> <th align="center">CFG</th> <th align="center">Default steps</th> <th align="center">Default <code>guidance_scale</code></th> <th align="center">Default <code>shift</code></th> <th>HF repo</th></tr></thead> <tbody><tr><td><code>turbo</code> (guidance-distilled)</td> <td align="center">off</td> <td align="center">8</td> <td align="center">ignored</td> <td align="center">3.0</td> <td><a href="https://huggingface.co/ACE-Step/acestep-v15-xl-turbo-diffusers" rel="nofollow"><code>ACE-Step/acestep-v15-xl-turbo-diffusers</code></a></td></tr> <tr><td><code>base</code></td> <td align="center">on</td> <td align="center">8</td> <td align="center">7.0</td> <td align="center">3.0</td> <td><a href="https://huggingface.co/ACE-Step/acestep-v15-base" rel="nofollow"><code>ACE-Step/acestep-v15-base</code></a></td></tr> <tr><td><code>sft</code></td> <td align="center">on</td> <td align="center">8</td> <td align="center">7.0</td> <td align="center">3.0</td> <td><a href="https://huggingface.co/ACE-Step/acestep-v15-sft" rel="nofollow"><code>ACE-Step/acestep-v15-sft</code></a></td></tr></tbody>',ke,V,_t="Base and SFT use the learned <code>null_condition_emb</code> for classifier-free guidance (APG, not vanilla CFG). Users commonly override <code>num_inference_steps</code> to 30–60 on base/sft for higher quality.",je,Q,Ae,z,yt="When constructing a prompt, keep in mind:",$e,D,vt="<li>Descriptive prompt inputs work best; use adjectives to describe the music style, instruments, mood, and tempo.</li> <li>The prompt should describe the overall musical characteristics (e.g., “upbeat pop song with electric guitar and drums”).</li> <li>Lyrics should be structured with tags like <code>[verse]</code>, <code>[chorus]</code>, <code>[bridge]</code>, etc.</li>",Pe,L,bt="During inference:",qe,H,Mt="<li><code>num_inference_steps</code>, <code>guidance_scale</code>, and <code>shift</code> default to the values shown above. For turbo checkpoints, <code>guidance_scale &gt; 1.0</code> is ignored with a warning because guidance is distilled into the weights.</li> <li>The <code>audio_duration</code> parameter controls the length of the generated music in seconds.</li> <li>The <code>vocal_language</code> parameter should match the language of the lyrics.</li> <li><code>pipe.sample_rate</code> and <code>pipe.latents_per_second</code> are sourced from the VAE config (48000 Hz and 25 fps for the released checkpoints).</li> <li>For audio-to-audio tasks, pass <code>src_audio</code> and <code>reference_audio</code> as preprocessed stereo tensors at <code>pipe.sample_rate</code>.</li> <li><code>flash</code> and <code>flash_hub</code> use FlashAttention’s native sliding-window support for ACE-Step’s self-attention and expect unpadded text batches. If a batched prompt contains padding, use <code>flash_varlen</code> or <code>flash_varlen_hub</code> instead. Single-prompt inference with <code>padding=&quot;longest&quot;</code> is normally unpadded.</li>",Ge,W,Ie,R,Be,l,F,Le,se,xt="Pipeline for text-to-music generation using ACE-Step 1.5.",He,ae,Tt=`This model inherits from <a href="/docs/diffusers/pr_13745/en/api/pipelines/overview#diffusers.DiffusionPipeline">DiffusionPipeline</a>. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).`,We,ie,wt=`The pipeline uses flow matching with a custom timestep schedule for the diffusion process. The turbo model variant
uses 8 inference steps by default.`,Re,re,Ut="Supported task types:",Fe,ce,Ct="<li><code>&quot;text2music&quot;</code>: Generate music from text prompts and lyrics.</li> <li><code>&quot;cover&quot;</code>: Generate audio from source audio / semantic codes with timbre transfer from reference audio.</li> <li><code>&quot;repaint&quot;</code>: Regenerate a section of existing audio while keeping the rest.</li> <li><code>&quot;extract&quot;</code>: Extract a specific track (e.g., vocals, drums) from audio.</li> <li><code>&quot;lego&quot;</code>: Generate a specific track based on audio context.</li> <li><code>&quot;complete&quot;</code>: Complete an input audio with additional tracks.</li>",Xe,w,X,Ye,le,Jt="The call function to the pipeline for music generation.",Oe,k,Ke,j,Y,et,de,St="Validate user-facing arguments before we start allocating noise tensors.",tt,U,O,nt,pe,kt="Encode text prompts and lyrics into embeddings.",ot,ue,jt=`Text prompts are encoded through the full text encoder model to produce contextual hidden states. Lyrics are
only passed through the text encoder’s embedding layer (token lookup), since the lyric encoder in the condition
encoder handles the contextual encoding.`,st,A,K,at,me,At="Prepare initial noise latents for the flow matching process.",it,C,ee,rt,fe,$t="Process reference audio into acoustic latents for the timbre encoder.",ct,ge,Pt=`The reference audio is repeated/cropped to 30 seconds (3 segments of 10 seconds each from front, middle, and
back), encoded through the VAE, and then transposed for the timbre encoder.`,lt,$,te,dt,he,qt="Prepare source latents for text-to-music and audio-to-audio tasks.",Ee,ne,Ze,be,Ne;return M=new Qt({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),x=new De({props:{title:"ACE-Step 1.5",local:"ace-step-15",headingTag:"h1"}}),E=new De({props:{title:"Variants",local:"variants",headingTag:"h2"}}),Q=new De({props:{title:"Tips",local:"tips",headingTag:"h2"}}),W=new Gt({props:{code:"aW1wb3J0JTIwdG9yY2glMEFpbXBvcnQlMjBzb3VuZGZpbGUlMjBhcyUyMHNmJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMEFjZVN0ZXBQaXBlbGluZSUwQSUwQXBpcGUlMjAlM0QlMjBBY2VTdGVwUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUyMkFDRS1TdGVwJTJGYWNlc3RlcC12MTUteGwtdHVyYm8tZGlmZnVzZXJzJTIyJTJDJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5iZmxvYXQxNiklMEFwaXBlJTIwJTNEJTIwcGlwZS50byglMjJjdWRhJTIyKSUwQSUwQWF1ZGlvJTIwJTNEJTIwcGlwZSglMEElMjAlMjAlMjAlMjBwcm9tcHQlM0QlMjJBJTIwYmVhdXRpZnVsJTIwcGlhbm8lMjBwaWVjZSUyMHdpdGglMjBzb2Z0JTIwbWVsb2RpZXMlMjBhbmQlMjBnZW50bGUlMjByaHl0aG0lMjIlMkMlMEElMjAlMjAlMjAlMjBseXJpY3MlM0QlMjIlNUJ2ZXJzZSU1RCU1Q25Tb2Z0JTIwbm90ZXMlMjBpbiUyMHRoZSUyMG1vcm5pbmclMjBsaWdodCU1Q25EYW5jaW5nJTIwdGhyb3VnaCUyMHRoZSUyMGFpciUyMHNvJTIwYnJpZ2h0JTVDbiU1QmNob3J1cyU1RCU1Q25NdXNpYyUyMGZpbGxzJTIwdGhlJTIwYWlyJTIwdG9uaWdodCU1Q25FdmVyeSUyMG5vdGUlMjBmZWVscyUyMGp1c3QlMjByaWdodCUyMiUyQyUwQSUyMCUyMCUyMCUyMGF1ZGlvX2R1cmF0aW9uJTNEMzAuMCUyQyUwQSkuYXVkaW9zJTBBJTBBc2Yud3JpdGUoJTIyb3V0cHV0LndhdiUyMiUyQyUyMGF1ZGlvJTVCMCU1RC5ULmNwdSgpLmZsb2F0KCkubnVtcHkoKSUyQyUyMHBpcGUuc2FtcGxlX3JhdGUp",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">import</span> soundfile <span class="hljs-keyword">as</span> sf
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AceStepPipeline
pipe = AceStepPipeline.from_pretrained(<span class="hljs-string">&quot;ACE-Step/acestep-v15-xl-turbo-diffusers&quot;</span>, torch_dtype=torch.bfloat16)
pipe = pipe.to(<span class="hljs-string">&quot;cuda&quot;</span>)
audio = pipe(
prompt=<span class="hljs-string">&quot;A beautiful piano piece with soft melodies and gentle rhythm&quot;</span>,
lyrics=<span class="hljs-string">&quot;[verse]\\nSoft notes in the morning light\\nDancing through the air so bright\\n[chorus]\\nMusic fills the air tonight\\nEvery note feels just right&quot;</span>,
audio_duration=<span class="hljs-number">30.0</span>,
).audios
sf.write(<span class="hljs-string">&quot;output.wav&quot;</span>, audio[<span class="hljs-number">0</span>].T.cpu().<span class="hljs-built_in">float</span>().numpy(), pipe.sample_rate)`,lang:"python",wrap:!1}}),R=new De({props:{title:"AceStepPipeline",local:"diffusers.AceStepPipeline",headingTag:"h2"}}),F=new oe({props:{name:"class diffusers.AceStepPipeline",anchor:"diffusers.AceStepPipeline",parameters:[{name:"vae",val:": AutoencoderOobleck"},{name:"text_encoder",val:": PreTrainedModel"},{name:"tokenizer",val:": TokenizersBackend"},{name:"transformer",val:": AceStepTransformer1DModel"},{name:"condition_encoder",val:": AceStepConditionEncoder"},{name:"scheduler",val:": FlowMatchEulerDiscreteScheduler"},{name:"audio_tokenizer",val:": typing.Optional[diffusers.pipelines.ace_step.modeling_ace_step.AceStepAudioTokenizer] = None"},{name:"audio_token_detokenizer",val:": typing.Optional[diffusers.pipelines.ace_step.modeling_ace_step.AceStepAudioTokenDetokenizer] = None"}],parametersDescription:[{anchor:"diffusers.AceStepPipeline.vae",description:`<strong>vae</strong> (<a href="/docs/diffusers/pr_13745/en/api/models/autoencoder_oobleck#diffusers.AutoencoderOobleck">AutoencoderOobleck</a>) &#x2014;
Variational Auto-Encoder (VAE) model to encode and decode audio waveforms to and from latent
representations.`,name:"vae"},{anchor:"diffusers.AceStepPipeline.text_encoder",description:`<strong>text_encoder</strong> (<a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModel" rel="nofollow">AutoModel</a>) &#x2014;
Text encoder model (e.g., Qwen3-Embedding-0.6B) for encoding text prompts and lyrics.`,name:"text_encoder"},{anchor:"diffusers.AceStepPipeline.tokenizer",description:`<strong>tokenizer</strong> (<a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoTokenizer" rel="nofollow">AutoTokenizer</a>) &#x2014;
Tokenizer for the text encoder.`,name:"tokenizer"},{anchor:"diffusers.AceStepPipeline.transformer",description:`<strong>transformer</strong> (<a href="/docs/diffusers/pr_13745/en/api/models/ace_step_transformer#diffusers.AceStepTransformer1DModel">AceStepTransformer1DModel</a>) &#x2014;
The Diffusion Transformer (DiT) model for denoising audio latents.`,name:"transformer"},{anchor:"diffusers.AceStepPipeline.condition_encoder",description:`<strong>condition_encoder</strong> (<code>AceStepConditionEncoder</code>) &#x2014;
Condition encoder that combines text, lyric, and timbre embeddings for cross-attention.`,name:"condition_encoder"},{anchor:"diffusers.AceStepPipeline.scheduler",description:`<strong>scheduler</strong> (<a href="/docs/diffusers/pr_13745/en/api/schedulers/flow_match_euler_discrete#diffusers.FlowMatchEulerDiscreteScheduler">FlowMatchEulerDiscreteScheduler</a>) &#x2014;
Flow-matching Euler scheduler. ACE-Step feeds the DiT timesteps in <code>[0, 1]</code>, so the scheduler is configured
with <code>num_train_timesteps=1</code> and <code>shift=1.0</code> &#x2014; the pipeline computes its shifted / turbo sigma schedule
itself and passes it via <code>set_timesteps(sigmas=...)</code>.`,name:"scheduler"}],source:"https://github.com/huggingface/diffusers/blob/vr_13745/src/diffusers/pipelines/ace_step/pipeline_ace_step.py#L132"}}),X=new oe({props:{name:"__call__",anchor:"diffusers.AceStepPipeline.__call__",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]] = None"},{name:"lyrics",val:": typing.Union[str, typing.List[str]] = ''"},{name:"audio_duration",val:": float = 60.0"},{name:"vocal_language",val:": typing.Union[str, typing.List[str]] = 'en'"},{name:"num_inference_steps",val:": int = 8"},{name:"guidance_scale",val:": float = 7.0"},{name:"shift",val:": float = 3.0"},{name:"generator",val:": typing.Union[torch._C.Generator, typing.List[torch._C.Generator], NoneType] = None"},{name:"latents",val:": typing.Optional[torch.Tensor] = None"},{name:"output_type",val:": typing.Optional[str] = 'pt'"},{name:"return_dict",val:": bool = True"},{name:"callback",val:": typing.Optional[typing.Callable[[int, int, torch.Tensor], NoneType]] = None"},{name:"callback_steps",val:": typing.Optional[int] = 1"},{name:"callback_on_step_end",val:": typing.Optional[typing.Callable[..., dict]] = None"},{name:"callback_on_step_end_tensor_inputs",val:": typing.List[str] = ('latents',)"},{name:"instruction",val:": typing.Optional[str] = None"},{name:"max_text_length",val:": int = 256"},{name:"max_lyric_length",val:": int = 2048"},{name:"bpm",val:": typing.Optional[int] = None"},{name:"keyscale",val:": typing.Optional[str] = None"},{name:"timesignature",val:": typing.Optional[str] = None"},{name:"task_type",val:": str = 'text2music'"},{name:"track_name",val:": typing.Optional[str] = None"},{name:"complete_track_classes",val:": typing.Optional[typing.List[str]] = None"},{name:"src_audio",val:": typing.Optional[torch.Tensor] = None"},{name:"reference_audio",val:": typing.Optional[torch.Tensor] = None"},{name:"audio_codes",val:": typing.Union[str, typing.List[str], NoneType] = None"},{name:"repainting_start",val:": typing.Optional[float] = None"},{name:"repainting_end",val:": typing.Optional[float] = None"},{name:"audio_cover_strength",val:": float = 1.0"},{name:"cfg_interval_start",val:": float = 0.0"},{name:"cfg_interval_end",val:": float = 1.0"},{name:"timesteps",val:": typing.Optional[typing.List[float]] = None"}],parametersDescription:[{anchor:"diffusers.AceStepPipeline.__call__.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) &#x2014;
The prompt or prompts to guide music generation. Describes the style, genre, instruments, etc.`,name:"prompt"},{anchor:"diffusers.AceStepPipeline.__call__.lyrics",description:`<strong>lyrics</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>, defaults to <code>&quot;&quot;</code>) &#x2014;
The lyrics text for the music. Supports structured lyrics with tags like <code>[verse]</code>, <code>[chorus]</code>, etc.`,name:"lyrics"},{anchor:"diffusers.AceStepPipeline.__call__.audio_duration",description:`<strong>audio_duration</strong> (<code>float</code>, <em>optional</em>, defaults to 60.0) &#x2014;
Duration of the generated audio in seconds.`,name:"audio_duration"},{anchor:"diffusers.AceStepPipeline.__call__.vocal_language",description:`<strong>vocal_language</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>, defaults to <code>&quot;en&quot;</code>) &#x2014;
Language code for the lyrics (e.g., <code>&quot;en&quot;</code>, <code>&quot;zh&quot;</code>, <code>&quot;ja&quot;</code>).`,name:"vocal_language"},{anchor:"diffusers.AceStepPipeline.__call__.num_inference_steps",description:`<strong>num_inference_steps</strong> (<code>int</code>, <em>optional</em>, defaults to 8) &#x2014;
The number of denoising steps. The turbo model is designed for 8 steps.`,name:"num_inference_steps"},{anchor:"diffusers.AceStepPipeline.__call__.guidance_scale",description:`<strong>guidance_scale</strong> (<code>float</code>, <em>optional</em>, defaults to 7.0) &#x2014;
Guidance scale for classifier-free guidance. A value of 1.0 disables CFG.`,name:"guidance_scale"},{anchor:"diffusers.AceStepPipeline.__call__.shift",description:`<strong>shift</strong> (<code>float</code>, <em>optional</em>, defaults to 3.0) &#x2014;
Shift parameter for the timestep schedule (1.0, 2.0, or 3.0).`,name:"shift"},{anchor:"diffusers.AceStepPipeline.__call__.generator",description:`<strong>generator</strong> (<code>torch.Generator</code> or <code>List[torch.Generator]</code>, <em>optional</em>) &#x2014;
A generator to make generation deterministic.`,name:"generator"},{anchor:"diffusers.AceStepPipeline.__call__.latents",description:`<strong>latents</strong> (<code>torch.Tensor</code>, <em>optional</em>) &#x2014;
Pre-generated noise latents of shape <code>(batch_size, latent_length, acoustic_dim)</code>.`,name:"latents"},{anchor:"diffusers.AceStepPipeline.__call__.output_type",description:`<strong>output_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;pt&quot;</code>) &#x2014;
Output format. <code>&quot;pt&quot;</code> for PyTorch tensor, <code>&quot;np&quot;</code> for NumPy array, <code>&quot;latent&quot;</code> for raw latents.`,name:"output_type"},{anchor:"diffusers.AceStepPipeline.__call__.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
Whether to return an <code>AudioPipelineOutput</code> or a plain tuple.`,name:"return_dict"},{anchor:"diffusers.AceStepPipeline.__call__.callback",description:`<strong>callback</strong> (<code>Callable</code>, <em>optional</em>) &#x2014;
A function called every <code>callback_steps</code> steps with <code>(step, timestep, latents)</code>.`,name:"callback"},{anchor:"diffusers.AceStepPipeline.__call__.callback_steps",description:`<strong>callback_steps</strong> (<code>int</code>, <em>optional</em>, defaults to 1) &#x2014;
Frequency of the callback function.`,name:"callback_steps"},{anchor:"diffusers.AceStepPipeline.__call__.callback_on_step_end",description:`<strong>callback_on_step_end</strong> (<code>Callable</code>, <em>optional</em>) &#x2014;
A function that is called at the end of each denoising step during inference. The function is called
with the following arguments: <code>callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)</code>. <code>callback_kwargs</code> will include a list of all tensors as specified by
<code>callback_on_step_end_tensor_inputs</code>.`,name:"callback_on_step_end"},{anchor:"diffusers.AceStepPipeline.__call__.callback_on_step_end_tensor_inputs",description:`<strong>callback_on_step_end_tensor_inputs</strong> (<code>List</code>, <em>optional</em>) &#x2014;
The list of tensor inputs for the <code>callback_on_step_end</code> function. The tensors specified in the list
will be passed as <code>callback_kwargs</code> argument. You will only be able to include variables listed in the
<code>._callback_tensor_inputs</code> attribute of your pipeline class.`,name:"callback_on_step_end_tensor_inputs"},{anchor:"diffusers.AceStepPipeline.__call__.instruction",description:`<strong>instruction</strong> (<code>str</code>, <em>optional</em>) &#x2014;
Custom instruction text for the generation task. If not provided, it is auto-generated based on
<code>task_type</code>.`,name:"instruction"},{anchor:"diffusers.AceStepPipeline.__call__.max_text_length",description:`<strong>max_text_length</strong> (<code>int</code>, <em>optional</em>, defaults to 256) &#x2014;
Maximum token length for text prompt encoding.`,name:"max_text_length"},{anchor:"diffusers.AceStepPipeline.__call__.max_lyric_length",description:`<strong>max_lyric_length</strong> (<code>int</code>, <em>optional</em>, defaults to 2048) &#x2014;
Maximum token length for lyrics encoding.`,name:"max_lyric_length"},{anchor:"diffusers.AceStepPipeline.__call__.bpm",description:`<strong>bpm</strong> (<code>int</code>, <em>optional</em>) &#x2014;
BPM (beats per minute) for music metadata. If <code>None</code>, the model estimates it.`,name:"bpm"},{anchor:"diffusers.AceStepPipeline.__call__.keyscale",description:`<strong>keyscale</strong> (<code>str</code>, <em>optional</em>) &#x2014;
Musical key (e.g., <code>&quot;C major&quot;</code>, <code>&quot;A minor&quot;</code>). If <code>None</code>, the model estimates it.`,name:"keyscale"},{anchor:"diffusers.AceStepPipeline.__call__.timesignature",description:`<strong>timesignature</strong> (<code>str</code>, <em>optional</em>) &#x2014;
Time signature (e.g., <code>&quot;4&quot;</code> for 4/4, <code>&quot;3&quot;</code> for 3/4). If <code>None</code>, the model estimates it.`,name:"timesignature"},{anchor:"diffusers.AceStepPipeline.__call__.task_type",description:`<strong>task_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;text2music&quot;</code>) &#x2014;
The generation task type. One of <code>&quot;text2music&quot;</code>, <code>&quot;cover&quot;</code>, <code>&quot;repaint&quot;</code>, <code>&quot;extract&quot;</code>, <code>&quot;lego&quot;</code>,
<code>&quot;complete&quot;</code>.`,name:"task_type"},{anchor:"diffusers.AceStepPipeline.__call__.track_name",description:`<strong>track_name</strong> (<code>str</code>, <em>optional</em>) &#x2014;
Track name for <code>&quot;extract&quot;</code> or <code>&quot;lego&quot;</code> tasks (e.g., <code>&quot;vocals&quot;</code>, <code>&quot;drums&quot;</code>).`,name:"track_name"},{anchor:"diffusers.AceStepPipeline.__call__.complete_track_classes",description:`<strong>complete_track_classes</strong> (<code>List[str]</code>, <em>optional</em>) &#x2014;
Track classes for the <code>&quot;complete&quot;</code> task.`,name:"complete_track_classes"},{anchor:"diffusers.AceStepPipeline.__call__.src_audio",description:`<strong>src_audio</strong> (<code>torch.Tensor</code>, <em>optional</em>) &#x2014;
Source audio tensor of shape <code>[channels, samples]</code> at 48kHz for audio-to-audio tasks (repaint, lego,
cover, extract, complete). The audio is encoded through the VAE to produce source latents.`,name:"src_audio"},{anchor:"diffusers.AceStepPipeline.__call__.reference_audio",description:`<strong>reference_audio</strong> (<code>torch.Tensor</code>, <em>optional</em>) &#x2014;
Reference audio tensor of shape <code>[channels, samples]</code> at 48kHz for timbre conditioning. Used to extract
timbre features for style transfer.`,name:"reference_audio"},{anchor:"diffusers.AceStepPipeline.__call__.audio_codes",description:`<strong>audio_codes</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) &#x2014;
Audio semantic code strings (e.g. <code>&quot;&lt;|audio_code_123|&gt;&lt;|audio_code_456|&gt;...&quot;</code>). When provided, the task
is automatically switched to <code>&quot;cover&quot;</code> mode and the registered ACE-Step audio tokenizer / detokenizer
modules decode the 5 Hz codes into 25 Hz acoustic conditioning.`,name:"audio_codes"},{anchor:"diffusers.AceStepPipeline.__call__.repainting_start",description:`<strong>repainting_start</strong> (<code>float</code>, <em>optional</em>) &#x2014;
Start time in seconds for the repaint region (for <code>&quot;repaint&quot;</code> and <code>&quot;lego&quot;</code> tasks).`,name:"repainting_start"},{anchor:"diffusers.AceStepPipeline.__call__.repainting_end",description:`<strong>repainting_end</strong> (<code>float</code>, <em>optional</em>) &#x2014;
End time in seconds for the repaint region. Use <code>-1</code> or <code>None</code> for until end.`,name:"repainting_end"},{anchor:"diffusers.AceStepPipeline.__call__.audio_cover_strength",description:`<strong>audio_cover_strength</strong> (<code>float</code>, <em>optional</em>, defaults to 1.0) &#x2014;
Strength of audio cover blending (0.0 to 1.0). When &lt; 1.0, blends cover-conditioned and
text-only-conditioned outputs. Lower values produce more style transfer effect.`,name:"audio_cover_strength"},{anchor:"diffusers.AceStepPipeline.__call__.cfg_interval_start",description:`<strong>cfg_interval_start</strong> (<code>float</code>, <em>optional</em>, defaults to 0.0) &#x2014;
Start ratio (0.0-1.0) of the timestep range where CFG is applied.`,name:"cfg_interval_start"},{anchor:"diffusers.AceStepPipeline.__call__.cfg_interval_end",description:`<strong>cfg_interval_end</strong> (<code>float</code>, <em>optional</em>, defaults to 1.0) &#x2014;
End ratio (0.0-1.0) of the timestep range where CFG is applied.`,name:"cfg_interval_end"},{anchor:"diffusers.AceStepPipeline.__call__.timesteps",description:`<strong>timesteps</strong> (<code>List[float]</code>, <em>optional</em>) &#x2014;
Custom timestep schedule. If provided, overrides <code>num_inference_steps</code> and <code>shift</code>.`,name:"timesteps"}],source:"https://github.com/huggingface/diffusers/blob/vr_13745/src/diffusers/pipelines/ace_step/pipeline_ace_step.py#L779",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>
<p>If <code>return_dict</code> is <code>True</code>, an <code>AudioPipelineOutput</code> is returned, otherwise a tuple with the generated
audio.</p>
`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>
<p><a
href="/docs/diffusers/pr_13745/en/api/pipelines/audioldm2#diffusers.AudioPipelineOutput"
>AudioPipelineOutput</a> or <code>tuple</code></p>
`}}),k=new zt({props:{anchor:"diffusers.AceStepPipeline.__call__.example",$$slots:{default:[Lt]},$$scope:{ctx:Me}}}),Y=new oe({props:{name:"check_inputs",anchor:"diffusers.AceStepPipeline.check_inputs",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]]"},{name:"lyrics",val:": typing.Union[str, typing.List[str]]"},{name:"task_type",val:": str"},{name:"num_inference_steps",val:": int"},{name:"guidance_scale",val:": float"},{name:"shift",val:": float"},{name:"audio_cover_strength",val:": float"},{name:"cfg_interval_start",val:": float"},{name:"cfg_interval_end",val:": float"},{name:"repainting_start",val:": typing.Optional[float]"},{name:"repainting_end",val:": typing.Optional[float]"}],source:"https://github.com/huggingface/diffusers/blob/vr_13745/src/diffusers/pipelines/ace_step/pipeline_ace_step.py#L227"}}),O=new oe({props:{name:"encode_prompt",anchor:"diffusers.AceStepPipeline.encode_prompt",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]]"},{name:"lyrics",val:": typing.Union[str, typing.List[str]]"},{name:"device",val:": device"},{name:"vocal_language",val:": typing.Union[str, typing.List[str]] = 'en'"},{name:"audio_duration",val:": float = 60.0"},{name:"instruction",val:": typing.Optional[str] = None"},{name:"bpm",val:": typing.Optional[int] = None"},{name:"keyscale",val:": typing.Optional[str] = None"},{name:"timesignature",val:": typing.Optional[str] = None"},{name:"max_text_length",val:": int = 256"},{name:"max_lyric_length",val:": int = 2048"}],parametersDescription:[{anchor:"diffusers.AceStepPipeline.encode_prompt.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>) &#x2014;
Text caption(s) describing the music.`,name:"prompt"},{anchor:"diffusers.AceStepPipeline.encode_prompt.lyrics",description:`<strong>lyrics</strong> (<code>str</code> or <code>List[str]</code>) &#x2014;
Lyric text(s).`,name:"lyrics"},{anchor:"diffusers.AceStepPipeline.encode_prompt.device",description:`<strong>device</strong> (<code>torch.device</code>) &#x2014;
Device for tensors.`,name:"device"},{anchor:"diffusers.AceStepPipeline.encode_prompt.vocal_language",description:`<strong>vocal_language</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>, defaults to <code>&quot;en&quot;</code>) &#x2014;
Language code(s) for lyrics.`,name:"vocal_language"},{anchor:"diffusers.AceStepPipeline.encode_prompt.audio_duration",description:`<strong>audio_duration</strong> (<code>float</code>, <em>optional</em>, defaults to 60.0) &#x2014;
Duration of the audio in seconds.`,name:"audio_duration"},{anchor:"diffusers.AceStepPipeline.encode_prompt.instruction",description:`<strong>instruction</strong> (<code>str</code>, <em>optional</em>) &#x2014;
Instruction text for generation.`,name:"instruction"},{anchor:"diffusers.AceStepPipeline.encode_prompt.bpm",description:`<strong>bpm</strong> (<code>int</code>, <em>optional</em>) &#x2014;
BPM (beats per minute) for metadata.`,name:"bpm"},{anchor:"diffusers.AceStepPipeline.encode_prompt.keyscale",description:`<strong>keyscale</strong> (<code>str</code>, <em>optional</em>) &#x2014;
Musical key (e.g., <code>&quot;C major&quot;</code>).`,name:"keyscale"},{anchor:"diffusers.AceStepPipeline.encode_prompt.timesignature",description:`<strong>timesignature</strong> (<code>str</code>, <em>optional</em>) &#x2014;
Time signature (e.g., <code>&quot;4&quot;</code> for 4/4).`,name:"timesignature"},{anchor:"diffusers.AceStepPipeline.encode_prompt.max_text_length",description:`<strong>max_text_length</strong> (<code>int</code>, <em>optional</em>, defaults to 256) &#x2014;
Maximum token length for text prompts.`,name:"max_text_length"},{anchor:"diffusers.AceStepPipeline.encode_prompt.max_lyric_length",description:`<strong>max_lyric_length</strong> (<code>int</code>, <em>optional</em>, defaults to 2048) &#x2014;
Maximum token length for lyrics.`,name:"max_lyric_length"}],source:"https://github.com/huggingface/diffusers/blob/vr_13745/src/diffusers/pipelines/ace_step/pipeline_ace_step.py#L396",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>
<p>Tuple of <code>(text_hidden_states, text_attention_mask, lyric_hidden_states, lyric_attention_mask)</code>.</p>
`}}),K=new oe({props:{name:"prepare_latents",anchor:"diffusers.AceStepPipeline.prepare_latents",parameters:[{name:"batch_size",val:": int"},{name:"audio_duration",val:": float"},{name:"dtype",val:": dtype"},{name:"device",val:": device"},{name:"generator",val:": typing.Union[torch._C.Generator, typing.List[torch._C.Generator], NoneType] = None"},{name:"latents",val:": typing.Optional[torch.Tensor] = None"}],parametersDescription:[{anchor:"diffusers.AceStepPipeline.prepare_latents.batch_size",description:"<strong>batch_size</strong> (<code>int</code>) &#x2014; Number of samples to generate.",name:"batch_size"},{anchor:"diffusers.AceStepPipeline.prepare_latents.audio_duration",description:"<strong>audio_duration</strong> (<code>float</code>) &#x2014; Duration of audio in seconds.",name:"audio_duration"},{anchor:"diffusers.AceStepPipeline.prepare_latents.dtype",description:"<strong>dtype</strong> (<code>torch.dtype</code>) &#x2014; Data type for the latents.",name:"dtype"},{anchor:"diffusers.AceStepPipeline.prepare_latents.device",description:"<strong>device</strong> (<code>torch.device</code>) &#x2014; Device for the latents.",name:"device"},{anchor:"diffusers.AceStepPipeline.prepare_latents.generator",description:"<strong>generator</strong> (<code>torch.Generator</code> or <code>List[torch.Generator]</code>, <em>optional</em>) &#x2014; Random number generator(s).",name:"generator"},{anchor:"diffusers.AceStepPipeline.prepare_latents.latents",description:"<strong>latents</strong> (<code>torch.Tensor</code>, <em>optional</em>) &#x2014; Pre-generated latents.",name:"latents"}],source:"https://github.com/huggingface/diffusers/blob/vr_13745/src/diffusers/pipelines/ace_step/pipeline_ace_step.py#L501",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>
<p>Noise latents of shape <code>(batch_size, latent_length, acoustic_dim)</code>.</p>
`}}),ee=new oe({props:{name:"prepare_reference_audio_latents",anchor:"diffusers.AceStepPipeline.prepare_reference_audio_latents",parameters:[{name:"reference_audio",val:": Tensor"},{name:"batch_size",val:": int"},{name:"device",val:": device"},{name:"dtype",val:": dtype"}],parametersDescription:[{anchor:"diffusers.AceStepPipeline.prepare_reference_audio_latents.reference_audio",description:`<strong>reference_audio</strong> (<code>torch.Tensor</code>) &#x2014; Reference audio tensor of shape <code>[channels, samples]</code> at
<code>self.sample_rate</code>.`,name:"reference_audio"},{anchor:"diffusers.AceStepPipeline.prepare_reference_audio_latents.batch_size",description:"<strong>batch_size</strong> (<code>int</code>) &#x2014; Batch size.",name:"batch_size"},{anchor:"diffusers.AceStepPipeline.prepare_reference_audio_latents.device",description:"<strong>device</strong> (<code>torch.device</code>) &#x2014; Target device.",name:"device"},{anchor:"diffusers.AceStepPipeline.prepare_reference_audio_latents.dtype",description:"<strong>dtype</strong> (<code>torch.dtype</code>) &#x2014; Target dtype.",name:"dtype"}],source:"https://github.com/huggingface/diffusers/blob/vr_13745/src/diffusers/pipelines/ace_step/pipeline_ace_step.py#L575",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>
<p>Tuple of <code>(refer_audio_acoustic, refer_audio_order_mask)</code>.</p>
`}}),te=new oe({props:{name:"prepare_src_latents",anchor:"diffusers.AceStepPipeline.prepare_src_latents",parameters:[{name:"device",val:": device"},{name:"dtype",val:": dtype"},{name:"batch_size",val:": int = 1"},{name:"src_audio",val:": typing.Optional[torch.Tensor] = None"},{name:"audio_codes",val:": typing.Union[str, typing.List[str], NoneType] = None"},{name:"latent_length",val:": typing.Optional[int] = None"},{name:"task_type",val:": str = 'text2music'"}],parametersDescription:[{anchor:"diffusers.AceStepPipeline.prepare_src_latents.src_audio",description:`<strong>src_audio</strong> (<code>torch.Tensor</code>, <em>optional</em>) &#x2014; Source audio tensor of shape <code>[channels, samples]</code> at
<code>self.sample_rate</code>.`,name:"src_audio"},{anchor:"diffusers.AceStepPipeline.prepare_src_latents.audio_codes",description:"<strong>audio_codes</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) &#x2014; Audio semantic code strings.",name:"audio_codes"},{anchor:"diffusers.AceStepPipeline.prepare_src_latents.latent_length",description:"<strong>latent_length</strong> (<code>int</code>, <em>optional</em>) &#x2014; Target latent length when no source audio or audio codes are given.",name:"latent_length"},{anchor:"diffusers.AceStepPipeline.prepare_src_latents.device",description:"<strong>device</strong> (<code>torch.device</code>) &#x2014; Target device.",name:"device"},{anchor:"diffusers.AceStepPipeline.prepare_src_latents.dtype",description:"<strong>dtype</strong> (<code>torch.dtype</code>) &#x2014; Target dtype.",name:"dtype"},{anchor:"diffusers.AceStepPipeline.prepare_src_latents.batch_size",description:"<strong>batch_size</strong> (<code>int</code>) &#x2014; Batch size.",name:"batch_size"},{anchor:"diffusers.AceStepPipeline.prepare_src_latents.task_type",description:"<strong>task_type</strong> (<code>str</code>) &#x2014; Current task type.",name:"task_type"}],source:"https://github.com/huggingface/diffusers/blob/vr_13745/src/diffusers/pipelines/ace_step/pipeline_ace_step.py#L628",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>
<p>Tuple of <code>(src_latents, latent_length)</code> where <code>src_latents</code> has shape <code>[batch, T, D]</code>.</p>
`}}),ne=new Dt({props:{source:"https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/ace_step.md"}}),{c(){m=r("meta"),P=o(),T=r("p"),b=o(),f(M.$$.fragment),u=o(),f(x.$$.fragment),xe=o(),q=r("p"),q.innerHTML=pt,Te=o(),G=r("p"),G.innerHTML=ut,we=o(),I=r("p"),I.textContent=mt,Ue=o(),B=r("p"),B.innerHTML=ft,Ce=o(),f(E.$$.fragment),Je=o(),Z=r("p"),Z.textContent=gt,Se=o(),N=r("table"),N.innerHTML=ht,ke=o(),V=r("p"),V.innerHTML=_t,je=o(),f(Q.$$.fragment),Ae=o(),z=r("p"),z.textContent=yt,$e=o(),D=r("ul"),D.innerHTML=vt,Pe=o(),L=r("p"),L.textContent=bt,qe=o(),H=r("ul"),H.innerHTML=Mt,Ge=o(),f(W.$$.fragment),Ie=o(),f(R.$$.fragment),Be=o(),l=r("div"),f(F.$$.fragment),Le=o(),se=r("p"),se.textContent=xt,He=o(),ae=r("p"),ae.innerHTML=Tt,We=o(),ie=r("p"),ie.textContent=wt,Re=o(),re=r("p"),re.textContent=Ut,Fe=o(),ce=r("ul"),ce.innerHTML=Ct,Xe=o(),w=r("div"),f(X.$$.fragment),Ye=o(),le=r("p"),le.textContent=Jt,Oe=o(),f(k.$$.fragment),Ke=o(),j=r("div"),f(Y.$$.fragment),et=o(),de=r("p"),de.textContent=St,tt=o(),U=r("div"),f(O.$$.fragment),nt=o(),pe=r("p"),pe.textContent=kt,ot=o(),ue=r("p"),ue.textContent=jt,st=o(),A=r("div"),f(K.$$.fragment),at=o(),me=r("p"),me.textContent=At,it=o(),C=r("div"),f(ee.$$.fragment),rt=o(),fe=r("p"),fe.textContent=$t,ct=o(),ge=r("p"),ge.textContent=Pt,lt=o(),$=r("div"),f(te.$$.fragment),dt=o(),he=r("p"),he.textContent=qt,Ee=o(),f(ne.$$.fragment),Ze=o(),be=r("p"),this.h()},l(e){const t=Vt("svelte-u9bgzb",document.head);m=c(t,"META",{name:!0,content:!0}),t.forEach(n),P=s(e),T=c(e,"P",{}),J(T).forEach(n),b=s(e),g(M.$$.fragment,e),u=s(e),g(x.$$.fragment,e),xe=s(e),q=c(e,"P",{"data-svelte-h":!0}),p(q)!=="svelte-1xmbfz5"&&(q.innerHTML=pt),Te=s(e),G=c(e,"P",{"data-svelte-h":!0}),p(G)!=="svelte-19liucp"&&(G.innerHTML=ut),we=s(e),I=c(e,"P",{"data-svelte-h":!0}),p(I)!=="svelte-1jpmgzg"&&(I.textContent=mt),Ue=s(e),B=c(e,"P",{"data-svelte-h":!0}),p(B)!=="svelte-nix58r"&&(B.innerHTML=ft),Ce=s(e),g(E.$$.fragment,e),Je=s(e),Z=c(e,"P",{"data-svelte-h":!0}),p(Z)!=="svelte-1n2ep4u"&&(Z.textContent=gt),Se=s(e),N=c(e,"TABLE",{"data-svelte-h":!0}),p(N)!=="svelte-234pxd"&&(N.innerHTML=ht),ke=s(e),V=c(e,"P",{"data-svelte-h":!0}),p(V)!=="svelte-14cgle6"&&(V.innerHTML=_t),je=s(e),g(Q.$$.fragment,e),Ae=s(e),z=c(e,"P",{"data-svelte-h":!0}),p(z)!=="svelte-1dvtu0c"&&(z.textContent=yt),$e=s(e),D=c(e,"UL",{"data-svelte-h":!0}),p(D)!=="svelte-igfv1y"&&(D.innerHTML=vt),Pe=s(e),L=c(e,"P",{"data-svelte-h":!0}),p(L)!=="svelte-1g0t9wk"&&(L.textContent=bt),qe=s(e),H=c(e,"UL",{"data-svelte-h":!0}),p(H)!=="svelte-zawy12"&&(H.innerHTML=Mt),Ge=s(e),g(W.$$.fragment,e),Ie=s(e),g(R.$$.fragment,e),Be=s(e),l=c(e,"DIV",{class:!0});var d=J(l);g(F.$$.fragment,d),Le=s(d),se=c(d,"P",{"data-svelte-h":!0}),p(se)!=="svelte-19idt02"&&(se.textContent=xt),He=s(d),ae=c(d,"P",{"data-svelte-h":!0}),p(ae)!=="svelte-k5j8ly"&&(ae.innerHTML=Tt),We=s(d),ie=c(d,"P",{"data-svelte-h":!0}),p(ie)!=="svelte-xc6yjp"&&(ie.textContent=wt),Re=s(d),re=c(d,"P",{"data-svelte-h":!0}),p(re)!=="svelte-1w6yaag"&&(re.textContent=Ut),Fe=s(d),ce=c(d,"UL",{"data-svelte-h":!0}),p(ce)!=="svelte-1piw6h1"&&(ce.innerHTML=Ct),Xe=s(d),w=c(d,"DIV",{class:!0});var _e=J(w);g(X.$$.fragment,_e),Ye=s(_e),le=c(_e,"P",{"data-svelte-h":!0}),p(le)!=="svelte-1liw69f"&&(le.textContent=Jt),Oe=s(_e),g(k.$$.fragment,_e),_e.forEach(n),Ke=s(d),j=c(d,"DIV",{class:!0});var Ve=J(j);g(Y.$$.fragment,Ve),et=s(Ve),de=c(Ve,"P",{"data-svelte-h":!0}),p(de)!=="svelte-1i5h6w3"&&(de.textContent=St),Ve.forEach(n),tt=s(d),U=c(d,"DIV",{class:!0});var ye=J(U);g(O.$$.fragment,ye),nt=s(ye),pe=c(ye,"P",{"data-svelte-h":!0}),p(pe)!=="svelte-14s2s2t"&&(pe.textContent=kt),ot=s(ye),ue=c(ye,"P",{"data-svelte-h":!0}),p(ue)!=="svelte-l7jx7"&&(ue.textContent=jt),ye.forEach(n),st=s(d),A=c(d,"DIV",{class:!0});var Qe=J(A);g(K.$$.fragment,Qe),at=s(Qe),me=c(Qe,"P",{"data-svelte-h":!0}),p(me)!=="svelte-1kcgwsa"&&(me.textContent=At),Qe.forEach(n),it=s(d),C=c(d,"DIV",{class:!0});var ve=J(C);g(ee.$$.fragment,ve),rt=s(ve),fe=c(ve,"P",{"data-svelte-h":!0}),p(fe)!=="svelte-583sdx"&&(fe.textContent=$t),ct=s(ve),ge=c(ve,"P",{"data-svelte-h":!0}),p(ge)!=="svelte-17nvcr3"&&(ge.textContent=Pt),ve.forEach(n),lt=s(d),$=c(d,"DIV",{class:!0});var ze=J($);g(te.$$.fragment,ze),dt=s(ze),he=c(ze,"P",{"data-svelte-h":!0}),p(he)!=="svelte-wb5sxz"&&(he.textContent=qt),ze.forEach(n),d.forEach(n),Ee=s(e),g(ne.$$.fragment,e),Ze=s(e),be=c(e,"P",{}),J(be).forEach(n),this.h()},h(){S(m,"name","hf:doc:metadata"),S(m,"content",Wt),S(w,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),S(j,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),S(U,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),S(A,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),S(C,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),S($,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),S(l,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8")},m(e,t){a(document.head,m),i(e,P,t),i(e,T,t),i(e,b,t),h(M,e,t),i(e,u,t),h(x,e,t),i(e,xe,t),i(e,q,t),i(e,Te,t),i(e,G,t),i(e,we,t),i(e,I,t),i(e,Ue,t),i(e,B,t),i(e,Ce,t),h(E,e,t),i(e,Je,t),i(e,Z,t),i(e,Se,t),i(e,N,t),i(e,ke,t),i(e,V,t),i(e,je,t),h(Q,e,t),i(e,Ae,t),i(e,z,t),i(e,$e,t),i(e,D,t),i(e,Pe,t),i(e,L,t),i(e,qe,t),i(e,H,t),i(e,Ge,t),h(W,e,t),i(e,Ie,t),h(R,e,t),i(e,Be,t),i(e,l,t),h(F,l,null),a(l,Le),a(l,se),a(l,He),a(l,ae),a(l,We),a(l,ie),a(l,Re),a(l,re),a(l,Fe),a(l,ce),a(l,Xe),a(l,w),h(X,w,null),a(w,Ye),a(w,le),a(w,Oe),h(k,w,null),a(l,Ke),a(l,j),h(Y,j,null),a(j,et),a(j,de),a(l,tt),a(l,U),h(O,U,null),a(U,nt),a(U,pe),a(U,ot),a(U,ue),a(l,st),a(l,A),h(K,A,null),a(A,at),a(A,me),a(l,it),a(l,C),h(ee,C,null),a(C,rt),a(C,fe),a(C,ct),a(C,ge),a(l,lt),a(l,$),h(te,$,null),a($,dt),a($,he),i(e,Ee,t),h(ne,e,t),i(e,Ze,t),i(e,be,t),Ne=!0},p(e,[t]){const d={};t&2&&(d.$$scope={dirty:t,ctx:e}),k.$set(d)},i(e){Ne||(_(M.$$.fragment,e),_(x.$$.fragment,e),_(E.$$.fragment,e),_(Q.$$.fragment,e),_(W.$$.fragment,e),_(R.$$.fragment,e),_(F.$$.fragment,e),_(X.$$.fragment,e),_(k.$$.fragment,e),_(Y.$$.fragment,e),_(O.$$.fragment,e),_(K.$$.fragment,e),_(ee.$$.fragment,e),_(te.$$.fragment,e),_(ne.$$.fragment,e),Ne=!0)},o(e){y(M.$$.fragment,e),y(x.$$.fragment,e),y(E.$$.fragment,e),y(Q.$$.fragment,e),y(W.$$.fragment,e),y(R.$$.fragment,e),y(F.$$.fragment,e),y(X.$$.fragment,e),y(k.$$.fragment,e),y(Y.$$.fragment,e),y(O.$$.fragment,e),y(K.$$.fragment,e),y(ee.$$.fragment,e),y(te.$$.fragment,e),y(ne.$$.fragment,e),Ne=!1},d(e){e&&(n(P),n(T),n(b),n(u),n(xe),n(q),n(Te),n(G),n(we),n(I),n(Ue),n(B),n(Ce),n(Je),n(Z),n(Se),n(N),n(ke),n(V),n(je),n(Ae),n(z),n($e),n(D),n(Pe),n(L),n(qe),n(H),n(Ge),n(Ie),n(Be),n(l),n(Ee),n(Ze),n(be)),n(m),v(M,e),v(x,e),v(E,e),v(Q,e),v(W,e),v(R,e),v(F),v(X),v(k),v(Y),v(O),v(K),v(ee),v(te),v(ne,e)}}}const Wt='{"title":"ACE-Step 1.5","local":"ace-step-15","sections":[{"title":"Variants","local":"variants","sections":[],"depth":2},{"title":"Tips","local":"tips","sections":[],"depth":2},{"title":"AceStepPipeline","local":"diffusers.AceStepPipeline","sections":[],"depth":2}],"depth":1}';function Rt(Me){return Bt(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class nn extends Zt{constructor(m){super(),Nt(this,m,Rt,Ht,It,{})}}export{nn as component};

Xet Storage Details

Size:
50.7 kB
·
Xet hash:
114c5917e89d99a9e16ab3aeb03964e19528c7f4d0b81210b8278624868a664b

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.