Buckets:
| import{s as It,o as Bt,n as Et}from"../chunks/scheduler.53228c21.js";import{S as Zt,i as Nt,e as r,s as o,c as f,h as Vt,a as c,d as n,b as s,f as J,g,j as p,k as S,l as a,m as i,n as h,t as _,o as y,p as v}from"../chunks/index.cac5d66a.js";import{C as Qt}from"../chunks/CopyLLMTxtMenu.4912207d.js";import{D as oe}from"../chunks/Docstring.1e7ac4f3.js";import{C as Gt}from"../chunks/CodeBlock.606cbaf4.js";import{E as zt}from"../chunks/ExampleCodeBlock.ccf7d2a9.js";import{H as De,E as Dt}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.323ee77a.js";function Lt(Me){let m,P="Examples:",T,b,M;return b=new Gt({props:{code:"aW1wb3J0JTIwdG9yY2glMEFpbXBvcnQlMjBzb3VuZGZpbGUlMjBhcyUyMHNmJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMEFjZVN0ZXBQaXBlbGluZSUwQSUwQXBpcGUlMjAlM0QlMjBBY2VTdGVwUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMkFDRS1TdGVwJTJGYWNlc3RlcC12MTUteGwtdHVyYm8tZGlmZnVzZXJzJTIyJTJDJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5iZmxvYXQxNiUwQSklMEFwaXBlJTIwJTNEJTIwcGlwZS50byglMjJjdWRhJTIyKSUwQSUwQSUyMyUyMFRleHQtdG8tbXVzaWMlMjBnZW5lcmF0aW9uJTIwd2l0aCUyMG1ldGFkYXRhJTBBYXVkaW8lMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMHByb21wdCUzRCUyMkElMjBiZWF1dGlmdWwlMjBwaWFubyUyMHBpZWNlJTIwd2l0aCUyMHNvZnQlMjBtZWxvZGllcyUyMiUyQyUwQSUyMCUyMCUyMCUyMGx5cmljcyUzRCUyMiU1QnZlcnNlJTVEJTVDblNvZnQlMjBub3RlcyUyMGluJTIwdGhlJTIwbW9ybmluZyUyMGxpZ2h0JTVDbiU1QmNob3J1cyU1RCU1Q25NdXNpYyUyMGZpbGxzJTIwdGhlJTIwYWlyJTIwdG9uaWdodCUyMiUyQyUwQSUyMCUyMCUyMCUyMGF1ZGlvX2R1cmF0aW9uJTNEMzAuMCUyQyUwQSUyMCUyMCUyMCUyMG51bV9pbmZlcmVuY2Vfc3RlcHMlM0Q4JTJDJTBBJTIwJTIwJTIwJTIwYnBtJTNEMTIwJTJDJTBBJTIwJTIwJTIwJTIwa2V5c2NhbGUlM0QlMjJDJTIwbWFqb3IlMjIlMkMlMEElMjAlMjAlMjAlMjB0aW1lc2lnbmF0dXJlJTNEJTIyNCUyMiUyQyUwQSkuYXVkaW9zJTBBJTBBJTIzJTIwU2F2ZSUyMHRoZSUyMGdlbmVyYXRlZCUyMGF1ZGlvJTBBc2Yud3JpdGUoJTIyb3V0cHV0LndhdiUyMiUyQyUyMGF1ZGlvJTVCMCUyQyUyMDAlNUQuY3B1KCkubnVtcHkoKSUyQyUyMDQ4MDAwKSUwQSUwQSUyMyUyMFJlcGFpbnQlMjB0YXNrJTNBJTIwcmVnZW5lcmF0ZSUyMGElMjBzZWN0aW9uJTIwb2YlMjBleGlzdGluZyUyMHN0ZXJlbyUyMDQ4a0h6JTIwYXVkaW8lMEFzcmNfYXVkaW8lMkMlMjBzciUyMCUzRCUyMHNmLnJlYWQoJTIyaW5wdXQud2F2JTIyKSUwQXNyY19hdWRpbyUyMCUzRCUyMHRvcmNoLmZyb21fbnVtcHkoc3JjX2F1ZGlvKS5mbG9hdCgpLlQlMEFhdWRpbyUyMCUzRCUyMHBpcGUoJTBBJTIwJTIwJTIwJTIwcHJvbXB0JTNEJTIyRXBpYyUyMHJvY2slMjBndWl0YXIlMjBzb2xvJTIyJTJDJTBBJTIwJTIwJTIwJTIwbHlyaWNzJTNEJTIyJTIyJTJDJTBBJTIwJTIwJTIwJTIwdGFza190eXBlJTNEJTIycmVwYWludCUyMiUyQyUwQSUyMCUyMCUyMCUyMHNyY19hdWRpbyUzRHNyY19hdWRpbyUyQyUwQSUyMCUyMCUyMCUyMHJlcGFpbnRpbmdfc3RhcnQlM0QxMC4wJTJDJTBBJTIwJTIwJTIwJTIwcmVwYWludGluZ19lbmQlM0QyMC4wJTJDJTBBKS5hdWRpb3MlMEElMEElMjMlMjBDb3ZlciUyMHRhc2slMjB3aXRoJTIwcmVmZXJlbmNlJTIwYXVkaW8lMjBmb3IlMjB0aW1icmUlMjB0cmFuc2ZlciUwQXJlZl9hdWRpbyUyQyUyMHNyJTIwJTNEJTIwc2YucmVhZCglMjJyZWZlcmVuY2Uud2F2JTIyKSUwQXJlZl9hdWRpbyUyMCUzRCUyMHRvcmNoLmZyb21fbnVtcHkocmVmX2F1ZGlvKS5mbG9hdCgpLlQlMEFhdWRpbyUyMCUzRCUyMHBpcGUoJTBBJTIwJTIwJTIwJTIwcHJvbXB0JTNEJTIyUG9wJTIwc29uZyUyMHdpdGglMjBicmlnaHQlMjB2b2NhbHMlMjIlMkMlMEElMjAlMjAlMjAlMjBseXJpY3MlM0QlMjIlNUJ2ZXJzZSU1RCU1Q25IZWxsbyUyMHdvcmxkJTIyJTJDJTBBJTIwJTIwJTIwJTIwdGFza190eXBlJTNEJTIyY292ZXIlMjIlMkMlMEElMjAlMjAlMjAlMjByZWZlcmVuY2VfYXVkaW8lM0RyZWZfYXVkaW8lMkMlMEElMjAlMjAlMjAlMjBhdWRpb19jb3Zlcl9zdHJlbmd0aCUzRDAuOCUyQyUwQSkuYXVkaW9z",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> soundfile <span class="hljs-keyword">as</span> sf | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AceStepPipeline | |
| <span class="hljs-meta">>>> </span>pipe = AceStepPipeline.from_pretrained( | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"ACE-Step/acestep-v15-xl-turbo-diffusers"</span>, torch_dtype=torch.bfloat16 | |
| <span class="hljs-meta">... </span>) | |
| <span class="hljs-meta">>>> </span>pipe = pipe.to(<span class="hljs-string">"cuda"</span>) | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># Text-to-music generation with metadata</span> | |
| <span class="hljs-meta">>>> </span>audio = pipe( | |
| <span class="hljs-meta">... </span> prompt=<span class="hljs-string">"A beautiful piano piece with soft melodies"</span>, | |
| <span class="hljs-meta">... </span> lyrics=<span class="hljs-string">"[verse]\\nSoft notes in the morning light\\n[chorus]\\nMusic fills the air tonight"</span>, | |
| <span class="hljs-meta">... </span> audio_duration=<span class="hljs-number">30.0</span>, | |
| <span class="hljs-meta">... </span> num_inference_steps=<span class="hljs-number">8</span>, | |
| <span class="hljs-meta">... </span> bpm=<span class="hljs-number">120</span>, | |
| <span class="hljs-meta">... </span> keyscale=<span class="hljs-string">"C major"</span>, | |
| <span class="hljs-meta">... </span> timesignature=<span class="hljs-string">"4"</span>, | |
| <span class="hljs-meta">... </span>).audios | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># Save the generated audio</span> | |
| <span class="hljs-meta">>>> </span>sf.write(<span class="hljs-string">"output.wav"</span>, audio[<span class="hljs-number">0</span>, <span class="hljs-number">0</span>].cpu().numpy(), <span class="hljs-number">48000</span>) | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># Repaint task: regenerate a section of existing stereo 48kHz audio</span> | |
| <span class="hljs-meta">>>> </span>src_audio, sr = sf.read(<span class="hljs-string">"input.wav"</span>) | |
| <span class="hljs-meta">>>> </span>src_audio = torch.from_numpy(src_audio).<span class="hljs-built_in">float</span>().T | |
| <span class="hljs-meta">>>> </span>audio = pipe( | |
| <span class="hljs-meta">... </span> prompt=<span class="hljs-string">"Epic rock guitar solo"</span>, | |
| <span class="hljs-meta">... </span> lyrics=<span class="hljs-string">""</span>, | |
| <span class="hljs-meta">... </span> task_type=<span class="hljs-string">"repaint"</span>, | |
| <span class="hljs-meta">... </span> src_audio=src_audio, | |
| <span class="hljs-meta">... </span> repainting_start=<span class="hljs-number">10.0</span>, | |
| <span class="hljs-meta">... </span> repainting_end=<span class="hljs-number">20.0</span>, | |
| <span class="hljs-meta">... </span>).audios | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># Cover task with reference audio for timbre transfer</span> | |
| <span class="hljs-meta">>>> </span>ref_audio, sr = sf.read(<span class="hljs-string">"reference.wav"</span>) | |
| <span class="hljs-meta">>>> </span>ref_audio = torch.from_numpy(ref_audio).<span class="hljs-built_in">float</span>().T | |
| <span class="hljs-meta">>>> </span>audio = pipe( | |
| <span class="hljs-meta">... </span> prompt=<span class="hljs-string">"Pop song with bright vocals"</span>, | |
| <span class="hljs-meta">... </span> lyrics=<span class="hljs-string">"[verse]\\nHello world"</span>, | |
| <span class="hljs-meta">... </span> task_type=<span class="hljs-string">"cover"</span>, | |
| <span class="hljs-meta">... </span> reference_audio=ref_audio, | |
| <span class="hljs-meta">... </span> audio_cover_strength=<span class="hljs-number">0.8</span>, | |
| <span class="hljs-meta">... </span>).audios`,lang:"py",wrap:!1}}),{c(){m=r("p"),m.textContent=P,T=o(),f(b.$$.fragment)},l(u){m=c(u,"P",{"data-svelte-h":!0}),p(m)!=="svelte-kvfsh7"&&(m.textContent=P),T=s(u),g(b.$$.fragment,u)},m(u,x){i(u,m,x),i(u,T,x),h(b,u,x),M=!0},p:Et,i(u){M||(_(b.$$.fragment,u),M=!0)},o(u){y(b.$$.fragment,u),M=!1},d(u){u&&(n(m),n(T)),v(b,u)}}}function Ht(Me){let m,P,T,b,M,u,x,xe,q,pt='ACE-Step 1.5 was introduced in <a href="https://arxiv.org/abs/2602.00744" rel="nofollow">ACE-Step 1.5: Pushing the Boundaries of Open-Source Music Generation</a> by the ACE-Step Team (ACE Studio and StepFun). It is an open-source music foundation model that generates commercial-grade stereo music with lyrics from text prompts.',Te,G,ut='ACE-Step 1.5 generates variable-length stereo audio at 48 kHz (10 seconds to 10 minutes) from text prompts and optional lyrics. The full system pairs a Language Model planner with a Diffusion Transformer (DiT) synthesizer; this pipeline wraps the DiT half of that stack, and consists of three components: an <a href="/docs/diffusers/pr_13745/en/api/models/autoencoder_oobleck#diffusers.AutoencoderOobleck">AutoencoderOobleck</a> VAE that compresses waveforms into 25 Hz stereo latents, a Qwen3-based text encoder for prompt and lyric conditioning, and an <a href="/docs/diffusers/pr_13745/en/api/models/ace_step_transformer#diffusers.AceStepTransformer1DModel">AceStepTransformer1DModel</a> DiT that operates in the VAE latent space using flow matching.',we,I,mt="The model supports 50+ languages for lyrics — including English, Chinese, Japanese, Korean, French, German, Spanish, Italian, Portuguese, and Russian — and runs on consumer GPUs (under 4 GB of VRAM when offloaded).",Ue,B,ft='This pipeline was contributed by the <a href="https://github.com/ace-step" rel="nofollow">ACE-Step Team</a>. The original codebase can be found at <a href="https://github.com/ace-step/ACE-Step-1.5" rel="nofollow">ace-step/ACE-Step-1.5</a>.',Ce,E,Je,Z,gt="ACE-Step 1.5 ships three DiT checkpoints that share the same transformer architecture but differ in guidance behavior; the pipeline auto-detects turbo checkpoints from the loaded transformer config and ignores CFG guidance for those guidance-distilled weights.",Se,N,ht='<thead><tr><th>Variant</th> <th align="center">CFG</th> <th align="center">Default steps</th> <th align="center">Default <code>guidance_scale</code></th> <th align="center">Default <code>shift</code></th> <th>HF repo</th></tr></thead> <tbody><tr><td><code>turbo</code> (guidance-distilled)</td> <td align="center">off</td> <td align="center">8</td> <td align="center">ignored</td> <td align="center">3.0</td> <td><a href="https://huggingface.co/ACE-Step/acestep-v15-xl-turbo-diffusers" rel="nofollow"><code>ACE-Step/acestep-v15-xl-turbo-diffusers</code></a></td></tr> <tr><td><code>base</code></td> <td align="center">on</td> <td align="center">8</td> <td align="center">7.0</td> <td align="center">3.0</td> <td><a href="https://huggingface.co/ACE-Step/acestep-v15-base" rel="nofollow"><code>ACE-Step/acestep-v15-base</code></a></td></tr> <tr><td><code>sft</code></td> <td align="center">on</td> <td align="center">8</td> <td align="center">7.0</td> <td align="center">3.0</td> <td><a href="https://huggingface.co/ACE-Step/acestep-v15-sft" rel="nofollow"><code>ACE-Step/acestep-v15-sft</code></a></td></tr></tbody>',ke,V,_t="Base and SFT use the learned <code>null_condition_emb</code> for classifier-free guidance (APG, not vanilla CFG). Users commonly override <code>num_inference_steps</code> to 30–60 on base/sft for higher quality.",je,Q,Ae,z,yt="When constructing a prompt, keep in mind:",$e,D,vt="<li>Descriptive prompt inputs work best; use adjectives to describe the music style, instruments, mood, and tempo.</li> <li>The prompt should describe the overall musical characteristics (e.g., “upbeat pop song with electric guitar and drums”).</li> <li>Lyrics should be structured with tags like <code>[verse]</code>, <code>[chorus]</code>, <code>[bridge]</code>, etc.</li>",Pe,L,bt="During inference:",qe,H,Mt="<li><code>num_inference_steps</code>, <code>guidance_scale</code>, and <code>shift</code> default to the values shown above. For turbo checkpoints, <code>guidance_scale > 1.0</code> is ignored with a warning because guidance is distilled into the weights.</li> <li>The <code>audio_duration</code> parameter controls the length of the generated music in seconds.</li> <li>The <code>vocal_language</code> parameter should match the language of the lyrics.</li> <li><code>pipe.sample_rate</code> and <code>pipe.latents_per_second</code> are sourced from the VAE config (48000 Hz and 25 fps for the released checkpoints).</li> <li>For audio-to-audio tasks, pass <code>src_audio</code> and <code>reference_audio</code> as preprocessed stereo tensors at <code>pipe.sample_rate</code>.</li> <li><code>flash</code> and <code>flash_hub</code> use FlashAttention’s native sliding-window support for ACE-Step’s self-attention and expect unpadded text batches. If a batched prompt contains padding, use <code>flash_varlen</code> or <code>flash_varlen_hub</code> instead. Single-prompt inference with <code>padding="longest"</code> is normally unpadded.</li>",Ge,W,Ie,R,Be,l,F,Le,se,xt="Pipeline for text-to-music generation using ACE-Step 1.5.",He,ae,Tt=`This model inherits from <a href="/docs/diffusers/pr_13745/en/api/pipelines/overview#diffusers.DiffusionPipeline">DiffusionPipeline</a>. Check the superclass documentation for the generic methods | |
| implemented for all pipelines (downloading, saving, running on a particular device, etc.).`,We,ie,wt=`The pipeline uses flow matching with a custom timestep schedule for the diffusion process. The turbo model variant | |
| uses 8 inference steps by default.`,Re,re,Ut="Supported task types:",Fe,ce,Ct="<li><code>"text2music"</code>: Generate music from text prompts and lyrics.</li> <li><code>"cover"</code>: Generate audio from source audio / semantic codes with timbre transfer from reference audio.</li> <li><code>"repaint"</code>: Regenerate a section of existing audio while keeping the rest.</li> <li><code>"extract"</code>: Extract a specific track (e.g., vocals, drums) from audio.</li> <li><code>"lego"</code>: Generate a specific track based on audio context.</li> <li><code>"complete"</code>: Complete an input audio with additional tracks.</li>",Xe,w,X,Ye,le,Jt="The call function to the pipeline for music generation.",Oe,k,Ke,j,Y,et,de,St="Validate user-facing arguments before we start allocating noise tensors.",tt,U,O,nt,pe,kt="Encode text prompts and lyrics into embeddings.",ot,ue,jt=`Text prompts are encoded through the full text encoder model to produce contextual hidden states. Lyrics are | |
| only passed through the text encoder’s embedding layer (token lookup), since the lyric encoder in the condition | |
| encoder handles the contextual encoding.`,st,A,K,at,me,At="Prepare initial noise latents for the flow matching process.",it,C,ee,rt,fe,$t="Process reference audio into acoustic latents for the timbre encoder.",ct,ge,Pt=`The reference audio is repeated/cropped to 30 seconds (3 segments of 10 seconds each from front, middle, and | |
| back), encoded through the VAE, and then transposed for the timbre encoder.`,lt,$,te,dt,he,qt="Prepare source latents for text-to-music and audio-to-audio tasks.",Ee,ne,Ze,be,Ne;return M=new Qt({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),x=new De({props:{title:"ACE-Step 1.5",local:"ace-step-15",headingTag:"h1"}}),E=new De({props:{title:"Variants",local:"variants",headingTag:"h2"}}),Q=new De({props:{title:"Tips",local:"tips",headingTag:"h2"}}),W=new Gt({props:{code:"aW1wb3J0JTIwdG9yY2glMEFpbXBvcnQlMjBzb3VuZGZpbGUlMjBhcyUyMHNmJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMEFjZVN0ZXBQaXBlbGluZSUwQSUwQXBpcGUlMjAlM0QlMjBBY2VTdGVwUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUyMkFDRS1TdGVwJTJGYWNlc3RlcC12MTUteGwtdHVyYm8tZGlmZnVzZXJzJTIyJTJDJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5iZmxvYXQxNiklMEFwaXBlJTIwJTNEJTIwcGlwZS50byglMjJjdWRhJTIyKSUwQSUwQWF1ZGlvJTIwJTNEJTIwcGlwZSglMEElMjAlMjAlMjAlMjBwcm9tcHQlM0QlMjJBJTIwYmVhdXRpZnVsJTIwcGlhbm8lMjBwaWVjZSUyMHdpdGglMjBzb2Z0JTIwbWVsb2RpZXMlMjBhbmQlMjBnZW50bGUlMjByaHl0aG0lMjIlMkMlMEElMjAlMjAlMjAlMjBseXJpY3MlM0QlMjIlNUJ2ZXJzZSU1RCU1Q25Tb2Z0JTIwbm90ZXMlMjBpbiUyMHRoZSUyMG1vcm5pbmclMjBsaWdodCU1Q25EYW5jaW5nJTIwdGhyb3VnaCUyMHRoZSUyMGFpciUyMHNvJTIwYnJpZ2h0JTVDbiU1QmNob3J1cyU1RCU1Q25NdXNpYyUyMGZpbGxzJTIwdGhlJTIwYWlyJTIwdG9uaWdodCU1Q25FdmVyeSUyMG5vdGUlMjBmZWVscyUyMGp1c3QlMjByaWdodCUyMiUyQyUwQSUyMCUyMCUyMCUyMGF1ZGlvX2R1cmF0aW9uJTNEMzAuMCUyQyUwQSkuYXVkaW9zJTBBJTBBc2Yud3JpdGUoJTIyb3V0cHV0LndhdiUyMiUyQyUyMGF1ZGlvJTVCMCU1RC5ULmNwdSgpLmZsb2F0KCkubnVtcHkoKSUyQyUyMHBpcGUuc2FtcGxlX3JhdGUp",highlighted:`<span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">import</span> soundfile <span class="hljs-keyword">as</span> sf | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AceStepPipeline | |
| pipe = AceStepPipeline.from_pretrained(<span class="hljs-string">"ACE-Step/acestep-v15-xl-turbo-diffusers"</span>, torch_dtype=torch.bfloat16) | |
| pipe = pipe.to(<span class="hljs-string">"cuda"</span>) | |
| audio = pipe( | |
| prompt=<span class="hljs-string">"A beautiful piano piece with soft melodies and gentle rhythm"</span>, | |
| lyrics=<span class="hljs-string">"[verse]\\nSoft notes in the morning light\\nDancing through the air so bright\\n[chorus]\\nMusic fills the air tonight\\nEvery note feels just right"</span>, | |
| audio_duration=<span class="hljs-number">30.0</span>, | |
| ).audios | |
| sf.write(<span class="hljs-string">"output.wav"</span>, audio[<span class="hljs-number">0</span>].T.cpu().<span class="hljs-built_in">float</span>().numpy(), pipe.sample_rate)`,lang:"python",wrap:!1}}),R=new De({props:{title:"AceStepPipeline",local:"diffusers.AceStepPipeline",headingTag:"h2"}}),F=new oe({props:{name:"class diffusers.AceStepPipeline",anchor:"diffusers.AceStepPipeline",parameters:[{name:"vae",val:": AutoencoderOobleck"},{name:"text_encoder",val:": PreTrainedModel"},{name:"tokenizer",val:": TokenizersBackend"},{name:"transformer",val:": AceStepTransformer1DModel"},{name:"condition_encoder",val:": AceStepConditionEncoder"},{name:"scheduler",val:": FlowMatchEulerDiscreteScheduler"},{name:"audio_tokenizer",val:": typing.Optional[diffusers.pipelines.ace_step.modeling_ace_step.AceStepAudioTokenizer] = None"},{name:"audio_token_detokenizer",val:": typing.Optional[diffusers.pipelines.ace_step.modeling_ace_step.AceStepAudioTokenDetokenizer] = None"}],parametersDescription:[{anchor:"diffusers.AceStepPipeline.vae",description:`<strong>vae</strong> (<a href="/docs/diffusers/pr_13745/en/api/models/autoencoder_oobleck#diffusers.AutoencoderOobleck">AutoencoderOobleck</a>) — | |
| Variational Auto-Encoder (VAE) model to encode and decode audio waveforms to and from latent | |
| representations.`,name:"vae"},{anchor:"diffusers.AceStepPipeline.text_encoder",description:`<strong>text_encoder</strong> (<a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModel" rel="nofollow">AutoModel</a>) — | |
| Text encoder model (e.g., Qwen3-Embedding-0.6B) for encoding text prompts and lyrics.`,name:"text_encoder"},{anchor:"diffusers.AceStepPipeline.tokenizer",description:`<strong>tokenizer</strong> (<a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoTokenizer" rel="nofollow">AutoTokenizer</a>) — | |
| Tokenizer for the text encoder.`,name:"tokenizer"},{anchor:"diffusers.AceStepPipeline.transformer",description:`<strong>transformer</strong> (<a href="/docs/diffusers/pr_13745/en/api/models/ace_step_transformer#diffusers.AceStepTransformer1DModel">AceStepTransformer1DModel</a>) — | |
| The Diffusion Transformer (DiT) model for denoising audio latents.`,name:"transformer"},{anchor:"diffusers.AceStepPipeline.condition_encoder",description:`<strong>condition_encoder</strong> (<code>AceStepConditionEncoder</code>) — | |
| Condition encoder that combines text, lyric, and timbre embeddings for cross-attention.`,name:"condition_encoder"},{anchor:"diffusers.AceStepPipeline.scheduler",description:`<strong>scheduler</strong> (<a href="/docs/diffusers/pr_13745/en/api/schedulers/flow_match_euler_discrete#diffusers.FlowMatchEulerDiscreteScheduler">FlowMatchEulerDiscreteScheduler</a>) — | |
| Flow-matching Euler scheduler. ACE-Step feeds the DiT timesteps in <code>[0, 1]</code>, so the scheduler is configured | |
| with <code>num_train_timesteps=1</code> and <code>shift=1.0</code> — the pipeline computes its shifted / turbo sigma schedule | |
| itself and passes it via <code>set_timesteps(sigmas=...)</code>.`,name:"scheduler"}],source:"https://github.com/huggingface/diffusers/blob/vr_13745/src/diffusers/pipelines/ace_step/pipeline_ace_step.py#L132"}}),X=new oe({props:{name:"__call__",anchor:"diffusers.AceStepPipeline.__call__",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]] = None"},{name:"lyrics",val:": typing.Union[str, typing.List[str]] = ''"},{name:"audio_duration",val:": float = 60.0"},{name:"vocal_language",val:": typing.Union[str, typing.List[str]] = 'en'"},{name:"num_inference_steps",val:": int = 8"},{name:"guidance_scale",val:": float = 7.0"},{name:"shift",val:": float = 3.0"},{name:"generator",val:": typing.Union[torch._C.Generator, typing.List[torch._C.Generator], NoneType] = None"},{name:"latents",val:": typing.Optional[torch.Tensor] = None"},{name:"output_type",val:": typing.Optional[str] = 'pt'"},{name:"return_dict",val:": bool = True"},{name:"callback",val:": typing.Optional[typing.Callable[[int, int, torch.Tensor], NoneType]] = None"},{name:"callback_steps",val:": typing.Optional[int] = 1"},{name:"callback_on_step_end",val:": typing.Optional[typing.Callable[..., dict]] = None"},{name:"callback_on_step_end_tensor_inputs",val:": typing.List[str] = ('latents',)"},{name:"instruction",val:": typing.Optional[str] = None"},{name:"max_text_length",val:": int = 256"},{name:"max_lyric_length",val:": int = 2048"},{name:"bpm",val:": typing.Optional[int] = None"},{name:"keyscale",val:": typing.Optional[str] = None"},{name:"timesignature",val:": typing.Optional[str] = None"},{name:"task_type",val:": str = 'text2music'"},{name:"track_name",val:": typing.Optional[str] = None"},{name:"complete_track_classes",val:": typing.Optional[typing.List[str]] = None"},{name:"src_audio",val:": typing.Optional[torch.Tensor] = None"},{name:"reference_audio",val:": typing.Optional[torch.Tensor] = None"},{name:"audio_codes",val:": typing.Union[str, typing.List[str], NoneType] = None"},{name:"repainting_start",val:": typing.Optional[float] = None"},{name:"repainting_end",val:": typing.Optional[float] = None"},{name:"audio_cover_strength",val:": float = 1.0"},{name:"cfg_interval_start",val:": float = 0.0"},{name:"cfg_interval_end",val:": float = 1.0"},{name:"timesteps",val:": typing.Optional[typing.List[float]] = None"}],parametersDescription:[{anchor:"diffusers.AceStepPipeline.__call__.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) — | |
| The prompt or prompts to guide music generation. Describes the style, genre, instruments, etc.`,name:"prompt"},{anchor:"diffusers.AceStepPipeline.__call__.lyrics",description:`<strong>lyrics</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>, defaults to <code>""</code>) — | |
| The lyrics text for the music. Supports structured lyrics with tags like <code>[verse]</code>, <code>[chorus]</code>, etc.`,name:"lyrics"},{anchor:"diffusers.AceStepPipeline.__call__.audio_duration",description:`<strong>audio_duration</strong> (<code>float</code>, <em>optional</em>, defaults to 60.0) — | |
| Duration of the generated audio in seconds.`,name:"audio_duration"},{anchor:"diffusers.AceStepPipeline.__call__.vocal_language",description:`<strong>vocal_language</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>, defaults to <code>"en"</code>) — | |
| Language code for the lyrics (e.g., <code>"en"</code>, <code>"zh"</code>, <code>"ja"</code>).`,name:"vocal_language"},{anchor:"diffusers.AceStepPipeline.__call__.num_inference_steps",description:`<strong>num_inference_steps</strong> (<code>int</code>, <em>optional</em>, defaults to 8) — | |
| The number of denoising steps. The turbo model is designed for 8 steps.`,name:"num_inference_steps"},{anchor:"diffusers.AceStepPipeline.__call__.guidance_scale",description:`<strong>guidance_scale</strong> (<code>float</code>, <em>optional</em>, defaults to 7.0) — | |
| Guidance scale for classifier-free guidance. A value of 1.0 disables CFG.`,name:"guidance_scale"},{anchor:"diffusers.AceStepPipeline.__call__.shift",description:`<strong>shift</strong> (<code>float</code>, <em>optional</em>, defaults to 3.0) — | |
| Shift parameter for the timestep schedule (1.0, 2.0, or 3.0).`,name:"shift"},{anchor:"diffusers.AceStepPipeline.__call__.generator",description:`<strong>generator</strong> (<code>torch.Generator</code> or <code>List[torch.Generator]</code>, <em>optional</em>) — | |
| A generator to make generation deterministic.`,name:"generator"},{anchor:"diffusers.AceStepPipeline.__call__.latents",description:`<strong>latents</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Pre-generated noise latents of shape <code>(batch_size, latent_length, acoustic_dim)</code>.`,name:"latents"},{anchor:"diffusers.AceStepPipeline.__call__.output_type",description:`<strong>output_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"pt"</code>) — | |
| Output format. <code>"pt"</code> for PyTorch tensor, <code>"np"</code> for NumPy array, <code>"latent"</code> for raw latents.`,name:"output_type"},{anchor:"diffusers.AceStepPipeline.__call__.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) — | |
| Whether to return an <code>AudioPipelineOutput</code> or a plain tuple.`,name:"return_dict"},{anchor:"diffusers.AceStepPipeline.__call__.callback",description:`<strong>callback</strong> (<code>Callable</code>, <em>optional</em>) — | |
| A function called every <code>callback_steps</code> steps with <code>(step, timestep, latents)</code>.`,name:"callback"},{anchor:"diffusers.AceStepPipeline.__call__.callback_steps",description:`<strong>callback_steps</strong> (<code>int</code>, <em>optional</em>, defaults to 1) — | |
| Frequency of the callback function.`,name:"callback_steps"},{anchor:"diffusers.AceStepPipeline.__call__.callback_on_step_end",description:`<strong>callback_on_step_end</strong> (<code>Callable</code>, <em>optional</em>) — | |
| A function that is called at the end of each denoising step during inference. The function is called | |
| with the following arguments: <code>callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)</code>. <code>callback_kwargs</code> will include a list of all tensors as specified by | |
| <code>callback_on_step_end_tensor_inputs</code>.`,name:"callback_on_step_end"},{anchor:"diffusers.AceStepPipeline.__call__.callback_on_step_end_tensor_inputs",description:`<strong>callback_on_step_end_tensor_inputs</strong> (<code>List</code>, <em>optional</em>) — | |
| The list of tensor inputs for the <code>callback_on_step_end</code> function. The tensors specified in the list | |
| will be passed as <code>callback_kwargs</code> argument. You will only be able to include variables listed in the | |
| <code>._callback_tensor_inputs</code> attribute of your pipeline class.`,name:"callback_on_step_end_tensor_inputs"},{anchor:"diffusers.AceStepPipeline.__call__.instruction",description:`<strong>instruction</strong> (<code>str</code>, <em>optional</em>) — | |
| Custom instruction text for the generation task. If not provided, it is auto-generated based on | |
| <code>task_type</code>.`,name:"instruction"},{anchor:"diffusers.AceStepPipeline.__call__.max_text_length",description:`<strong>max_text_length</strong> (<code>int</code>, <em>optional</em>, defaults to 256) — | |
| Maximum token length for text prompt encoding.`,name:"max_text_length"},{anchor:"diffusers.AceStepPipeline.__call__.max_lyric_length",description:`<strong>max_lyric_length</strong> (<code>int</code>, <em>optional</em>, defaults to 2048) — | |
| Maximum token length for lyrics encoding.`,name:"max_lyric_length"},{anchor:"diffusers.AceStepPipeline.__call__.bpm",description:`<strong>bpm</strong> (<code>int</code>, <em>optional</em>) — | |
| BPM (beats per minute) for music metadata. If <code>None</code>, the model estimates it.`,name:"bpm"},{anchor:"diffusers.AceStepPipeline.__call__.keyscale",description:`<strong>keyscale</strong> (<code>str</code>, <em>optional</em>) — | |
| Musical key (e.g., <code>"C major"</code>, <code>"A minor"</code>). If <code>None</code>, the model estimates it.`,name:"keyscale"},{anchor:"diffusers.AceStepPipeline.__call__.timesignature",description:`<strong>timesignature</strong> (<code>str</code>, <em>optional</em>) — | |
| Time signature (e.g., <code>"4"</code> for 4/4, <code>"3"</code> for 3/4). If <code>None</code>, the model estimates it.`,name:"timesignature"},{anchor:"diffusers.AceStepPipeline.__call__.task_type",description:`<strong>task_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"text2music"</code>) — | |
| The generation task type. One of <code>"text2music"</code>, <code>"cover"</code>, <code>"repaint"</code>, <code>"extract"</code>, <code>"lego"</code>, | |
| <code>"complete"</code>.`,name:"task_type"},{anchor:"diffusers.AceStepPipeline.__call__.track_name",description:`<strong>track_name</strong> (<code>str</code>, <em>optional</em>) — | |
| Track name for <code>"extract"</code> or <code>"lego"</code> tasks (e.g., <code>"vocals"</code>, <code>"drums"</code>).`,name:"track_name"},{anchor:"diffusers.AceStepPipeline.__call__.complete_track_classes",description:`<strong>complete_track_classes</strong> (<code>List[str]</code>, <em>optional</em>) — | |
| Track classes for the <code>"complete"</code> task.`,name:"complete_track_classes"},{anchor:"diffusers.AceStepPipeline.__call__.src_audio",description:`<strong>src_audio</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Source audio tensor of shape <code>[channels, samples]</code> at 48kHz for audio-to-audio tasks (repaint, lego, | |
| cover, extract, complete). The audio is encoded through the VAE to produce source latents.`,name:"src_audio"},{anchor:"diffusers.AceStepPipeline.__call__.reference_audio",description:`<strong>reference_audio</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Reference audio tensor of shape <code>[channels, samples]</code> at 48kHz for timbre conditioning. Used to extract | |
| timbre features for style transfer.`,name:"reference_audio"},{anchor:"diffusers.AceStepPipeline.__call__.audio_codes",description:`<strong>audio_codes</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) — | |
| Audio semantic code strings (e.g. <code>"<|audio_code_123|><|audio_code_456|>..."</code>). When provided, the task | |
| is automatically switched to <code>"cover"</code> mode and the registered ACE-Step audio tokenizer / detokenizer | |
| modules decode the 5 Hz codes into 25 Hz acoustic conditioning.`,name:"audio_codes"},{anchor:"diffusers.AceStepPipeline.__call__.repainting_start",description:`<strong>repainting_start</strong> (<code>float</code>, <em>optional</em>) — | |
| Start time in seconds for the repaint region (for <code>"repaint"</code> and <code>"lego"</code> tasks).`,name:"repainting_start"},{anchor:"diffusers.AceStepPipeline.__call__.repainting_end",description:`<strong>repainting_end</strong> (<code>float</code>, <em>optional</em>) — | |
| End time in seconds for the repaint region. Use <code>-1</code> or <code>None</code> for until end.`,name:"repainting_end"},{anchor:"diffusers.AceStepPipeline.__call__.audio_cover_strength",description:`<strong>audio_cover_strength</strong> (<code>float</code>, <em>optional</em>, defaults to 1.0) — | |
| Strength of audio cover blending (0.0 to 1.0). When < 1.0, blends cover-conditioned and | |
| text-only-conditioned outputs. Lower values produce more style transfer effect.`,name:"audio_cover_strength"},{anchor:"diffusers.AceStepPipeline.__call__.cfg_interval_start",description:`<strong>cfg_interval_start</strong> (<code>float</code>, <em>optional</em>, defaults to 0.0) — | |
| Start ratio (0.0-1.0) of the timestep range where CFG is applied.`,name:"cfg_interval_start"},{anchor:"diffusers.AceStepPipeline.__call__.cfg_interval_end",description:`<strong>cfg_interval_end</strong> (<code>float</code>, <em>optional</em>, defaults to 1.0) — | |
| End ratio (0.0-1.0) of the timestep range where CFG is applied.`,name:"cfg_interval_end"},{anchor:"diffusers.AceStepPipeline.__call__.timesteps",description:`<strong>timesteps</strong> (<code>List[float]</code>, <em>optional</em>) — | |
| Custom timestep schedule. If provided, overrides <code>num_inference_steps</code> and <code>shift</code>.`,name:"timesteps"}],source:"https://github.com/huggingface/diffusers/blob/vr_13745/src/diffusers/pipelines/ace_step/pipeline_ace_step.py#L779",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script> | |
| <p>If <code>return_dict</code> is <code>True</code>, an <code>AudioPipelineOutput</code> is returned, otherwise a tuple with the generated | |
| audio.</p> | |
| `,returnType:`<script context="module">export const metadata = 'undefined';<\/script> | |
| <p><a | |
| href="/docs/diffusers/pr_13745/en/api/pipelines/audioldm2#diffusers.AudioPipelineOutput" | |
| >AudioPipelineOutput</a> or <code>tuple</code></p> | |
| `}}),k=new zt({props:{anchor:"diffusers.AceStepPipeline.__call__.example",$$slots:{default:[Lt]},$$scope:{ctx:Me}}}),Y=new oe({props:{name:"check_inputs",anchor:"diffusers.AceStepPipeline.check_inputs",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]]"},{name:"lyrics",val:": typing.Union[str, typing.List[str]]"},{name:"task_type",val:": str"},{name:"num_inference_steps",val:": int"},{name:"guidance_scale",val:": float"},{name:"shift",val:": float"},{name:"audio_cover_strength",val:": float"},{name:"cfg_interval_start",val:": float"},{name:"cfg_interval_end",val:": float"},{name:"repainting_start",val:": typing.Optional[float]"},{name:"repainting_end",val:": typing.Optional[float]"}],source:"https://github.com/huggingface/diffusers/blob/vr_13745/src/diffusers/pipelines/ace_step/pipeline_ace_step.py#L227"}}),O=new oe({props:{name:"encode_prompt",anchor:"diffusers.AceStepPipeline.encode_prompt",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]]"},{name:"lyrics",val:": typing.Union[str, typing.List[str]]"},{name:"device",val:": device"},{name:"vocal_language",val:": typing.Union[str, typing.List[str]] = 'en'"},{name:"audio_duration",val:": float = 60.0"},{name:"instruction",val:": typing.Optional[str] = None"},{name:"bpm",val:": typing.Optional[int] = None"},{name:"keyscale",val:": typing.Optional[str] = None"},{name:"timesignature",val:": typing.Optional[str] = None"},{name:"max_text_length",val:": int = 256"},{name:"max_lyric_length",val:": int = 2048"}],parametersDescription:[{anchor:"diffusers.AceStepPipeline.encode_prompt.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>) — | |
| Text caption(s) describing the music.`,name:"prompt"},{anchor:"diffusers.AceStepPipeline.encode_prompt.lyrics",description:`<strong>lyrics</strong> (<code>str</code> or <code>List[str]</code>) — | |
| Lyric text(s).`,name:"lyrics"},{anchor:"diffusers.AceStepPipeline.encode_prompt.device",description:`<strong>device</strong> (<code>torch.device</code>) — | |
| Device for tensors.`,name:"device"},{anchor:"diffusers.AceStepPipeline.encode_prompt.vocal_language",description:`<strong>vocal_language</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>, defaults to <code>"en"</code>) — | |
| Language code(s) for lyrics.`,name:"vocal_language"},{anchor:"diffusers.AceStepPipeline.encode_prompt.audio_duration",description:`<strong>audio_duration</strong> (<code>float</code>, <em>optional</em>, defaults to 60.0) — | |
| Duration of the audio in seconds.`,name:"audio_duration"},{anchor:"diffusers.AceStepPipeline.encode_prompt.instruction",description:`<strong>instruction</strong> (<code>str</code>, <em>optional</em>) — | |
| Instruction text for generation.`,name:"instruction"},{anchor:"diffusers.AceStepPipeline.encode_prompt.bpm",description:`<strong>bpm</strong> (<code>int</code>, <em>optional</em>) — | |
| BPM (beats per minute) for metadata.`,name:"bpm"},{anchor:"diffusers.AceStepPipeline.encode_prompt.keyscale",description:`<strong>keyscale</strong> (<code>str</code>, <em>optional</em>) — | |
| Musical key (e.g., <code>"C major"</code>).`,name:"keyscale"},{anchor:"diffusers.AceStepPipeline.encode_prompt.timesignature",description:`<strong>timesignature</strong> (<code>str</code>, <em>optional</em>) — | |
| Time signature (e.g., <code>"4"</code> for 4/4).`,name:"timesignature"},{anchor:"diffusers.AceStepPipeline.encode_prompt.max_text_length",description:`<strong>max_text_length</strong> (<code>int</code>, <em>optional</em>, defaults to 256) — | |
| Maximum token length for text prompts.`,name:"max_text_length"},{anchor:"diffusers.AceStepPipeline.encode_prompt.max_lyric_length",description:`<strong>max_lyric_length</strong> (<code>int</code>, <em>optional</em>, defaults to 2048) — | |
| Maximum token length for lyrics.`,name:"max_lyric_length"}],source:"https://github.com/huggingface/diffusers/blob/vr_13745/src/diffusers/pipelines/ace_step/pipeline_ace_step.py#L396",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script> | |
| <p>Tuple of <code>(text_hidden_states, text_attention_mask, lyric_hidden_states, lyric_attention_mask)</code>.</p> | |
| `}}),K=new oe({props:{name:"prepare_latents",anchor:"diffusers.AceStepPipeline.prepare_latents",parameters:[{name:"batch_size",val:": int"},{name:"audio_duration",val:": float"},{name:"dtype",val:": dtype"},{name:"device",val:": device"},{name:"generator",val:": typing.Union[torch._C.Generator, typing.List[torch._C.Generator], NoneType] = None"},{name:"latents",val:": typing.Optional[torch.Tensor] = None"}],parametersDescription:[{anchor:"diffusers.AceStepPipeline.prepare_latents.batch_size",description:"<strong>batch_size</strong> (<code>int</code>) — Number of samples to generate.",name:"batch_size"},{anchor:"diffusers.AceStepPipeline.prepare_latents.audio_duration",description:"<strong>audio_duration</strong> (<code>float</code>) — Duration of audio in seconds.",name:"audio_duration"},{anchor:"diffusers.AceStepPipeline.prepare_latents.dtype",description:"<strong>dtype</strong> (<code>torch.dtype</code>) — Data type for the latents.",name:"dtype"},{anchor:"diffusers.AceStepPipeline.prepare_latents.device",description:"<strong>device</strong> (<code>torch.device</code>) — Device for the latents.",name:"device"},{anchor:"diffusers.AceStepPipeline.prepare_latents.generator",description:"<strong>generator</strong> (<code>torch.Generator</code> or <code>List[torch.Generator]</code>, <em>optional</em>) — Random number generator(s).",name:"generator"},{anchor:"diffusers.AceStepPipeline.prepare_latents.latents",description:"<strong>latents</strong> (<code>torch.Tensor</code>, <em>optional</em>) — Pre-generated latents.",name:"latents"}],source:"https://github.com/huggingface/diffusers/blob/vr_13745/src/diffusers/pipelines/ace_step/pipeline_ace_step.py#L501",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script> | |
| <p>Noise latents of shape <code>(batch_size, latent_length, acoustic_dim)</code>.</p> | |
| `}}),ee=new oe({props:{name:"prepare_reference_audio_latents",anchor:"diffusers.AceStepPipeline.prepare_reference_audio_latents",parameters:[{name:"reference_audio",val:": Tensor"},{name:"batch_size",val:": int"},{name:"device",val:": device"},{name:"dtype",val:": dtype"}],parametersDescription:[{anchor:"diffusers.AceStepPipeline.prepare_reference_audio_latents.reference_audio",description:`<strong>reference_audio</strong> (<code>torch.Tensor</code>) — Reference audio tensor of shape <code>[channels, samples]</code> at | |
| <code>self.sample_rate</code>.`,name:"reference_audio"},{anchor:"diffusers.AceStepPipeline.prepare_reference_audio_latents.batch_size",description:"<strong>batch_size</strong> (<code>int</code>) — Batch size.",name:"batch_size"},{anchor:"diffusers.AceStepPipeline.prepare_reference_audio_latents.device",description:"<strong>device</strong> (<code>torch.device</code>) — Target device.",name:"device"},{anchor:"diffusers.AceStepPipeline.prepare_reference_audio_latents.dtype",description:"<strong>dtype</strong> (<code>torch.dtype</code>) — Target dtype.",name:"dtype"}],source:"https://github.com/huggingface/diffusers/blob/vr_13745/src/diffusers/pipelines/ace_step/pipeline_ace_step.py#L575",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script> | |
| <p>Tuple of <code>(refer_audio_acoustic, refer_audio_order_mask)</code>.</p> | |
| `}}),te=new oe({props:{name:"prepare_src_latents",anchor:"diffusers.AceStepPipeline.prepare_src_latents",parameters:[{name:"device",val:": device"},{name:"dtype",val:": dtype"},{name:"batch_size",val:": int = 1"},{name:"src_audio",val:": typing.Optional[torch.Tensor] = None"},{name:"audio_codes",val:": typing.Union[str, typing.List[str], NoneType] = None"},{name:"latent_length",val:": typing.Optional[int] = None"},{name:"task_type",val:": str = 'text2music'"}],parametersDescription:[{anchor:"diffusers.AceStepPipeline.prepare_src_latents.src_audio",description:`<strong>src_audio</strong> (<code>torch.Tensor</code>, <em>optional</em>) — Source audio tensor of shape <code>[channels, samples]</code> at | |
| <code>self.sample_rate</code>.`,name:"src_audio"},{anchor:"diffusers.AceStepPipeline.prepare_src_latents.audio_codes",description:"<strong>audio_codes</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) — Audio semantic code strings.",name:"audio_codes"},{anchor:"diffusers.AceStepPipeline.prepare_src_latents.latent_length",description:"<strong>latent_length</strong> (<code>int</code>, <em>optional</em>) — Target latent length when no source audio or audio codes are given.",name:"latent_length"},{anchor:"diffusers.AceStepPipeline.prepare_src_latents.device",description:"<strong>device</strong> (<code>torch.device</code>) — Target device.",name:"device"},{anchor:"diffusers.AceStepPipeline.prepare_src_latents.dtype",description:"<strong>dtype</strong> (<code>torch.dtype</code>) — Target dtype.",name:"dtype"},{anchor:"diffusers.AceStepPipeline.prepare_src_latents.batch_size",description:"<strong>batch_size</strong> (<code>int</code>) — Batch size.",name:"batch_size"},{anchor:"diffusers.AceStepPipeline.prepare_src_latents.task_type",description:"<strong>task_type</strong> (<code>str</code>) — Current task type.",name:"task_type"}],source:"https://github.com/huggingface/diffusers/blob/vr_13745/src/diffusers/pipelines/ace_step/pipeline_ace_step.py#L628",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script> | |
| <p>Tuple of <code>(src_latents, latent_length)</code> where <code>src_latents</code> has shape <code>[batch, T, D]</code>.</p> | |
| `}}),ne=new Dt({props:{source:"https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/ace_step.md"}}),{c(){m=r("meta"),P=o(),T=r("p"),b=o(),f(M.$$.fragment),u=o(),f(x.$$.fragment),xe=o(),q=r("p"),q.innerHTML=pt,Te=o(),G=r("p"),G.innerHTML=ut,we=o(),I=r("p"),I.textContent=mt,Ue=o(),B=r("p"),B.innerHTML=ft,Ce=o(),f(E.$$.fragment),Je=o(),Z=r("p"),Z.textContent=gt,Se=o(),N=r("table"),N.innerHTML=ht,ke=o(),V=r("p"),V.innerHTML=_t,je=o(),f(Q.$$.fragment),Ae=o(),z=r("p"),z.textContent=yt,$e=o(),D=r("ul"),D.innerHTML=vt,Pe=o(),L=r("p"),L.textContent=bt,qe=o(),H=r("ul"),H.innerHTML=Mt,Ge=o(),f(W.$$.fragment),Ie=o(),f(R.$$.fragment),Be=o(),l=r("div"),f(F.$$.fragment),Le=o(),se=r("p"),se.textContent=xt,He=o(),ae=r("p"),ae.innerHTML=Tt,We=o(),ie=r("p"),ie.textContent=wt,Re=o(),re=r("p"),re.textContent=Ut,Fe=o(),ce=r("ul"),ce.innerHTML=Ct,Xe=o(),w=r("div"),f(X.$$.fragment),Ye=o(),le=r("p"),le.textContent=Jt,Oe=o(),f(k.$$.fragment),Ke=o(),j=r("div"),f(Y.$$.fragment),et=o(),de=r("p"),de.textContent=St,tt=o(),U=r("div"),f(O.$$.fragment),nt=o(),pe=r("p"),pe.textContent=kt,ot=o(),ue=r("p"),ue.textContent=jt,st=o(),A=r("div"),f(K.$$.fragment),at=o(),me=r("p"),me.textContent=At,it=o(),C=r("div"),f(ee.$$.fragment),rt=o(),fe=r("p"),fe.textContent=$t,ct=o(),ge=r("p"),ge.textContent=Pt,lt=o(),$=r("div"),f(te.$$.fragment),dt=o(),he=r("p"),he.textContent=qt,Ee=o(),f(ne.$$.fragment),Ze=o(),be=r("p"),this.h()},l(e){const t=Vt("svelte-u9bgzb",document.head);m=c(t,"META",{name:!0,content:!0}),t.forEach(n),P=s(e),T=c(e,"P",{}),J(T).forEach(n),b=s(e),g(M.$$.fragment,e),u=s(e),g(x.$$.fragment,e),xe=s(e),q=c(e,"P",{"data-svelte-h":!0}),p(q)!=="svelte-1xmbfz5"&&(q.innerHTML=pt),Te=s(e),G=c(e,"P",{"data-svelte-h":!0}),p(G)!=="svelte-19liucp"&&(G.innerHTML=ut),we=s(e),I=c(e,"P",{"data-svelte-h":!0}),p(I)!=="svelte-1jpmgzg"&&(I.textContent=mt),Ue=s(e),B=c(e,"P",{"data-svelte-h":!0}),p(B)!=="svelte-nix58r"&&(B.innerHTML=ft),Ce=s(e),g(E.$$.fragment,e),Je=s(e),Z=c(e,"P",{"data-svelte-h":!0}),p(Z)!=="svelte-1n2ep4u"&&(Z.textContent=gt),Se=s(e),N=c(e,"TABLE",{"data-svelte-h":!0}),p(N)!=="svelte-234pxd"&&(N.innerHTML=ht),ke=s(e),V=c(e,"P",{"data-svelte-h":!0}),p(V)!=="svelte-14cgle6"&&(V.innerHTML=_t),je=s(e),g(Q.$$.fragment,e),Ae=s(e),z=c(e,"P",{"data-svelte-h":!0}),p(z)!=="svelte-1dvtu0c"&&(z.textContent=yt),$e=s(e),D=c(e,"UL",{"data-svelte-h":!0}),p(D)!=="svelte-igfv1y"&&(D.innerHTML=vt),Pe=s(e),L=c(e,"P",{"data-svelte-h":!0}),p(L)!=="svelte-1g0t9wk"&&(L.textContent=bt),qe=s(e),H=c(e,"UL",{"data-svelte-h":!0}),p(H)!=="svelte-zawy12"&&(H.innerHTML=Mt),Ge=s(e),g(W.$$.fragment,e),Ie=s(e),g(R.$$.fragment,e),Be=s(e),l=c(e,"DIV",{class:!0});var d=J(l);g(F.$$.fragment,d),Le=s(d),se=c(d,"P",{"data-svelte-h":!0}),p(se)!=="svelte-19idt02"&&(se.textContent=xt),He=s(d),ae=c(d,"P",{"data-svelte-h":!0}),p(ae)!=="svelte-k5j8ly"&&(ae.innerHTML=Tt),We=s(d),ie=c(d,"P",{"data-svelte-h":!0}),p(ie)!=="svelte-xc6yjp"&&(ie.textContent=wt),Re=s(d),re=c(d,"P",{"data-svelte-h":!0}),p(re)!=="svelte-1w6yaag"&&(re.textContent=Ut),Fe=s(d),ce=c(d,"UL",{"data-svelte-h":!0}),p(ce)!=="svelte-1piw6h1"&&(ce.innerHTML=Ct),Xe=s(d),w=c(d,"DIV",{class:!0});var _e=J(w);g(X.$$.fragment,_e),Ye=s(_e),le=c(_e,"P",{"data-svelte-h":!0}),p(le)!=="svelte-1liw69f"&&(le.textContent=Jt),Oe=s(_e),g(k.$$.fragment,_e),_e.forEach(n),Ke=s(d),j=c(d,"DIV",{class:!0});var Ve=J(j);g(Y.$$.fragment,Ve),et=s(Ve),de=c(Ve,"P",{"data-svelte-h":!0}),p(de)!=="svelte-1i5h6w3"&&(de.textContent=St),Ve.forEach(n),tt=s(d),U=c(d,"DIV",{class:!0});var ye=J(U);g(O.$$.fragment,ye),nt=s(ye),pe=c(ye,"P",{"data-svelte-h":!0}),p(pe)!=="svelte-14s2s2t"&&(pe.textContent=kt),ot=s(ye),ue=c(ye,"P",{"data-svelte-h":!0}),p(ue)!=="svelte-l7jx7"&&(ue.textContent=jt),ye.forEach(n),st=s(d),A=c(d,"DIV",{class:!0});var Qe=J(A);g(K.$$.fragment,Qe),at=s(Qe),me=c(Qe,"P",{"data-svelte-h":!0}),p(me)!=="svelte-1kcgwsa"&&(me.textContent=At),Qe.forEach(n),it=s(d),C=c(d,"DIV",{class:!0});var ve=J(C);g(ee.$$.fragment,ve),rt=s(ve),fe=c(ve,"P",{"data-svelte-h":!0}),p(fe)!=="svelte-583sdx"&&(fe.textContent=$t),ct=s(ve),ge=c(ve,"P",{"data-svelte-h":!0}),p(ge)!=="svelte-17nvcr3"&&(ge.textContent=Pt),ve.forEach(n),lt=s(d),$=c(d,"DIV",{class:!0});var ze=J($);g(te.$$.fragment,ze),dt=s(ze),he=c(ze,"P",{"data-svelte-h":!0}),p(he)!=="svelte-wb5sxz"&&(he.textContent=qt),ze.forEach(n),d.forEach(n),Ee=s(e),g(ne.$$.fragment,e),Ze=s(e),be=c(e,"P",{}),J(be).forEach(n),this.h()},h(){S(m,"name","hf:doc:metadata"),S(m,"content",Wt),S(w,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),S(j,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),S(U,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),S(A,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),S(C,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),S($,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),S(l,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8")},m(e,t){a(document.head,m),i(e,P,t),i(e,T,t),i(e,b,t),h(M,e,t),i(e,u,t),h(x,e,t),i(e,xe,t),i(e,q,t),i(e,Te,t),i(e,G,t),i(e,we,t),i(e,I,t),i(e,Ue,t),i(e,B,t),i(e,Ce,t),h(E,e,t),i(e,Je,t),i(e,Z,t),i(e,Se,t),i(e,N,t),i(e,ke,t),i(e,V,t),i(e,je,t),h(Q,e,t),i(e,Ae,t),i(e,z,t),i(e,$e,t),i(e,D,t),i(e,Pe,t),i(e,L,t),i(e,qe,t),i(e,H,t),i(e,Ge,t),h(W,e,t),i(e,Ie,t),h(R,e,t),i(e,Be,t),i(e,l,t),h(F,l,null),a(l,Le),a(l,se),a(l,He),a(l,ae),a(l,We),a(l,ie),a(l,Re),a(l,re),a(l,Fe),a(l,ce),a(l,Xe),a(l,w),h(X,w,null),a(w,Ye),a(w,le),a(w,Oe),h(k,w,null),a(l,Ke),a(l,j),h(Y,j,null),a(j,et),a(j,de),a(l,tt),a(l,U),h(O,U,null),a(U,nt),a(U,pe),a(U,ot),a(U,ue),a(l,st),a(l,A),h(K,A,null),a(A,at),a(A,me),a(l,it),a(l,C),h(ee,C,null),a(C,rt),a(C,fe),a(C,ct),a(C,ge),a(l,lt),a(l,$),h(te,$,null),a($,dt),a($,he),i(e,Ee,t),h(ne,e,t),i(e,Ze,t),i(e,be,t),Ne=!0},p(e,[t]){const d={};t&2&&(d.$$scope={dirty:t,ctx:e}),k.$set(d)},i(e){Ne||(_(M.$$.fragment,e),_(x.$$.fragment,e),_(E.$$.fragment,e),_(Q.$$.fragment,e),_(W.$$.fragment,e),_(R.$$.fragment,e),_(F.$$.fragment,e),_(X.$$.fragment,e),_(k.$$.fragment,e),_(Y.$$.fragment,e),_(O.$$.fragment,e),_(K.$$.fragment,e),_(ee.$$.fragment,e),_(te.$$.fragment,e),_(ne.$$.fragment,e),Ne=!0)},o(e){y(M.$$.fragment,e),y(x.$$.fragment,e),y(E.$$.fragment,e),y(Q.$$.fragment,e),y(W.$$.fragment,e),y(R.$$.fragment,e),y(F.$$.fragment,e),y(X.$$.fragment,e),y(k.$$.fragment,e),y(Y.$$.fragment,e),y(O.$$.fragment,e),y(K.$$.fragment,e),y(ee.$$.fragment,e),y(te.$$.fragment,e),y(ne.$$.fragment,e),Ne=!1},d(e){e&&(n(P),n(T),n(b),n(u),n(xe),n(q),n(Te),n(G),n(we),n(I),n(Ue),n(B),n(Ce),n(Je),n(Z),n(Se),n(N),n(ke),n(V),n(je),n(Ae),n(z),n($e),n(D),n(Pe),n(L),n(qe),n(H),n(Ge),n(Ie),n(Be),n(l),n(Ee),n(Ze),n(be)),n(m),v(M,e),v(x,e),v(E,e),v(Q,e),v(W,e),v(R,e),v(F),v(X),v(k),v(Y),v(O),v(K),v(ee),v(te),v(ne,e)}}}const Wt='{"title":"ACE-Step 1.5","local":"ace-step-15","sections":[{"title":"Variants","local":"variants","sections":[],"depth":2},{"title":"Tips","local":"tips","sections":[],"depth":2},{"title":"AceStepPipeline","local":"diffusers.AceStepPipeline","sections":[],"depth":2}],"depth":1}';function Rt(Me){return Bt(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class nn extends Zt{constructor(m){super(),Nt(this,m,Rt,Ht,It,{})}}export{nn as component}; | |
Xet Storage Details
- Size:
- 50.7 kB
- Xet hash:
- 114c5917e89d99a9e16ab3aeb03964e19528c7f4d0b81210b8278624868a664b
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.