Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev / diffusers /pr_11636 /en /_app /immutable /nodes /109.faacfa06.js

rtrm's picture

29 days ago

82.1 kB

	import{s as jo,o as Do,n as Ft}from"../chunks/scheduler.53228c21.js";import{S as ko,i as Ao,e as r,s as n,c,h as Uo,a as d,d as o,b as s,f as D,g as m,j as _,k as j,l as i,m as a,n as u,t as f,o as g,p as h}from"../chunks/index.100fac89.js";import{C as Lo}from"../chunks/CopyLLMTxtMenu.8a16ebe2.js";import{D as U}from"../chunks/Docstring.07ca7ce7.js";import{C as Rt}from"../chunks/CodeBlock.d30a6509.js";import{E as zt}from"../chunks/ExampleCodeBlock.672157f9.js";import{H as N,E as $o}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.83a5c0e1.js";function Co(B){let p,T="Examples:",M,y,w;return y=new Rt({props:{code:"aW1wb3J0JTIwc2NpcHklMEFpbXBvcnQlMjB0b3JjaCUwQWZyb20lMjBkaWZmdXNlcnMlMjBpbXBvcnQlMjBBdWRpb0xETTJQaXBlbGluZSUwQSUwQXJlcG9faWQlMjAlM0QlMjAlMjJjdnNzcCUyRmF1ZGlvbGRtMiUyMiUwQXBpcGUlMjAlM0QlMjBBdWRpb0xETTJQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQocmVwb19pZCUyQyUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiklMEFwaXBlJTIwJTNEJTIwcGlwZS50byglMjJjdWRhJTIyKSUwQSUwQSUyMyUyMGRlZmluZSUyMHRoZSUyMHByb21wdHMlMEFwcm9tcHQlMjAlM0QlMjAlMjJUaGUlMjBzb3VuZCUyMG9mJTIwYSUyMGhhbW1lciUyMGhpdHRpbmclMjBhJTIwd29vZGVuJTIwc3VyZmFjZS4lMjIlMEFuZWdhdGl2ZV9wcm9tcHQlMjAlM0QlMjAlMjJMb3clMjBxdWFsaXR5LiUyMiUwQSUwQSUyMyUyMHNldCUyMHRoZSUyMHNlZWQlMjBmb3IlMjBnZW5lcmF0b3IlMEFnZW5lcmF0b3IlMjAlM0QlMjB0b3JjaC5HZW5lcmF0b3IoJTIyY3VkYSUyMikubWFudWFsX3NlZWQoMCklMEElMEElMjMlMjBydW4lMjB0aGUlMjBnZW5lcmF0aW9uJTBBYXVkaW8lMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMHByb21wdCUyQyUwQSUyMCUyMCUyMCUyMG5lZ2F0aXZlX3Byb21wdCUzRG5lZ2F0aXZlX3Byb21wdCUyQyUwQSUyMCUyMCUyMCUyMG51bV9pbmZlcmVuY2Vfc3RlcHMlM0QyMDAlMkMlMEElMjAlMjAlMjAlMjBhdWRpb19sZW5ndGhfaW5fcyUzRDEwLjAlMkMlMEElMjAlMjAlMjAlMjBudW1fd2F2ZWZvcm1zX3Blcl9wcm9tcHQlM0QzJTJDJTBBJTIwJTIwJTIwJTIwZ2VuZXJhdG9yJTNEZ2VuZXJhdG9yJTJDJTBBKS5hdWRpb3MlMEElMEElMjMlMjBzYXZlJTIwdGhlJTIwYmVzdCUyMGF1ZGlvJTIwc2FtcGxlJTIwKGluZGV4JTIwMCklMjBhcyUyMGElMjAud2F2JTIwZmlsZSUwQXNjaXB5LmlvLndhdmZpbGUud3JpdGUoJTIydGVjaG5vLndhdiUyMiUyQyUyMHJhdGUlM0QxNjAwMCUyQyUyMGRhdGElM0RhdWRpbyU1QjAlNUQp",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> scipy
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> torch
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AudioLDM2Pipeline

	<span class="hljs-meta">>>> </span>repo_id = <span class="hljs-string">"cvssp/audioldm2"</span>
	<span class="hljs-meta">>>> </span>pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
	<span class="hljs-meta">>>> </span>pipe = pipe.to(<span class="hljs-string">"cuda"</span>)

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># define the prompts</span>
	<span class="hljs-meta">>>> </span>prompt = <span class="hljs-string">"The sound of a hammer hitting a wooden surface."</span>
	<span class="hljs-meta">>>> </span>negative_prompt = <span class="hljs-string">"Low quality."</span>

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># set the seed for generator</span>
	<span class="hljs-meta">>>> </span>generator = torch.Generator(<span class="hljs-string">"cuda"</span>).manual_seed(<span class="hljs-number">0</span>)

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># run the generation</span>
	<span class="hljs-meta">>>> </span>audio = pipe(
	<span class="hljs-meta">... </span> prompt,
	<span class="hljs-meta">... </span> negative_prompt=negative_prompt,
	<span class="hljs-meta">... </span> num_inference_steps=<span class="hljs-number">200</span>,
	<span class="hljs-meta">... </span> audio_length_in_s=<span class="hljs-number">10.0</span>,
	<span class="hljs-meta">... </span> num_waveforms_per_prompt=<span class="hljs-number">3</span>,
	<span class="hljs-meta">... </span> generator=generator,
	<span class="hljs-meta">... </span>).audios

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># save the best audio sample (index 0) as a .wav file</span>
	<span class="hljs-meta">>>> </span>scipy.io.wavfile.write(<span class="hljs-string">"techno.wav"</span>, rate=<span class="hljs-number">16000</span>, data=audio[<span class="hljs-number">0</span>])`,wrap:!1}}),{c(){p=r("p"),p.textContent=T,M=n(),c(y.$$.fragment)},l(l){p=d(l,"P",{"data-svelte-h":!0}),_(p)!=="svelte-kvfsh7"&&(p.textContent=T),M=s(l),m(y.$$.fragment,l)},m(l,x){a(l,p,x),a(l,M,x),u(y,l,x),w=!0},p:Ft,i(l){w\|\|(f(y.$$.fragment,l),w=!0)},o(l){g(y.$$.fragment,l),w=!1},d(l){l&&(o(p),o(M)),h(y,l)}}}function No(B){let p,T;return p=new Rt({props:{code:"aW1wb3J0JTIwc2NpcHklMEFpbXBvcnQlMjB0b3JjaCUwQWZyb20lMjBkaWZmdXNlcnMlMjBpbXBvcnQlMjBBdWRpb0xETTJQaXBlbGluZSUwQSUwQXJlcG9faWQlMjAlM0QlMjAlMjJhbmhuY3QlMkZhdWRpb2xkbTJfZ2lnYXNwZWVjaCUyMiUwQXBpcGUlMjAlM0QlMjBBdWRpb0xETTJQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQocmVwb19pZCUyQyUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiklMEFwaXBlJTIwJTNEJTIwcGlwZS50byglMjJjdWRhJTIyKSUwQSUwQSUyMyUyMGRlZmluZSUyMHRoZSUyMHByb21wdHMlMEFwcm9tcHQlMjAlM0QlMjAlMjJBJTIwZmVtYWxlJTIwcmVwb3J0ZXIlMjBpcyUyMHNwZWFraW5nJTIyJTBBdHJhbnNjcmlwdCUyMCUzRCUyMCUyMndpc2glMjB5b3UlMjBoYXZlJTIwYSUyMGdvb2QlMjBkYXklMjIlMEElMEElMjMlMjBzZXQlMjB0aGUlMjBzZWVkJTIwZm9yJTIwZ2VuZXJhdG9yJTBBZ2VuZXJhdG9yJTIwJTNEJTIwdG9yY2guR2VuZXJhdG9yKCUyMmN1ZGElMjIpLm1hbnVhbF9zZWVkKDApJTBBJTBBJTIzJTIwcnVuJTIwdGhlJTIwZ2VuZXJhdGlvbiUwQWF1ZGlvJTIwJTNEJTIwcGlwZSglMEElMjAlMjAlMjAlMjBwcm9tcHQlMkMlMEElMjAlMjAlMjAlMjB0cmFuc2NyaXB0aW9uJTNEdHJhbnNjcmlwdCUyQyUwQSUyMCUyMCUyMCUyMG51bV9pbmZlcmVuY2Vfc3RlcHMlM0QyMDAlMkMlMEElMjAlMjAlMjAlMjBhdWRpb19sZW5ndGhfaW5fcyUzRDEwLjAlMkMlMEElMjAlMjAlMjAlMjBudW1fd2F2ZWZvcm1zX3Blcl9wcm9tcHQlM0QyJTJDJTBBJTIwJTIwJTIwJTIwZ2VuZXJhdG9yJTNEZ2VuZXJhdG9yJTJDJTBBJTIwJTIwJTIwJTIwbWF4X25ld190b2tlbnMlM0Q1MTIlMkMlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjNNdXN0JTIwc2V0JTIwbWF4X25ld190b2tlbnMlMjBlcXVhJTIwdG8lMjA1MTIlMjBmb3IlMjBUVFMlMEEpLmF1ZGlvcyUwQSUwQSUyMyUyMHNhdmUlMjB0aGUlMjBiZXN0JTIwYXVkaW8lMjBzYW1wbGUlMjAoaW5kZXglMjAwKSUyMGFzJTIwYSUyMC53YXYlMjBmaWxlJTBBc2NpcHkuaW8ud2F2ZmlsZS53cml0ZSglMjJ0dHMud2F2JTIyJTJDJTIwcmF0ZSUzRDE2MDAwJTJDJTIwZGF0YSUzRGF1ZGlvJTVCMCU1RCk=",highlighted:`#Using AudioLDM2 for Text To Speech
	<span class="hljs-meta prompt_">>>></span> <span class="language-python"><span class="hljs-keyword">import</span> scipy</span>
	<span class="hljs-meta prompt_">>>></span> <span class="language-python"><span class="hljs-keyword">import</span> torch</span>
	<span class="hljs-meta prompt_">>>></span> <span class="language-python"><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AudioLDM2Pipeline</span>

	<span class="hljs-meta prompt_">>>></span> <span class="language-python">repo_id = <span class="hljs-string">"anhnct/audioldm2_gigaspeech"</span></span>
	<span class="hljs-meta prompt_">>>></span> <span class="language-python">pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)</span>
	<span class="hljs-meta prompt_">>>></span> <span class="language-python">pipe = pipe.to(<span class="hljs-string">"cuda"</span>)</span>

	<span class="hljs-meta prompt_">>>></span> <span class="language-python"><span class="hljs-comment"># define the prompts</span></span>
	<span class="hljs-meta prompt_">>>></span> <span class="language-python">prompt = <span class="hljs-string">"A female reporter is speaking"</span></span>
	<span class="hljs-meta prompt_">>>></span> <span class="language-python">transcript = <span class="hljs-string">"wish you have a good day"</span></span>

	<span class="hljs-meta prompt_">>>></span> <span class="language-python"><span class="hljs-comment"># set the seed for generator</span></span>
	<span class="hljs-meta prompt_">>>></span> <span class="language-python">generator = torch.Generator(<span class="hljs-string">"cuda"</span>).manual_seed(<span class="hljs-number">0</span>)</span>

	<span class="hljs-meta prompt_">>>></span> <span class="language-python"><span class="hljs-comment"># run the generation</span></span>
	<span class="hljs-meta prompt_">>>></span> <span class="language-python">audio = pipe(</span>
	<span class="hljs-meta prompt_">...</span> <span class="language-python"> prompt,</span>
	<span class="hljs-meta prompt_">...</span> <span class="language-python"> transcription=transcript,</span>
	<span class="hljs-meta prompt_">...</span> <span class="language-python"> num_inference_steps=<span class="hljs-number">200</span>,</span>
	<span class="hljs-meta prompt_">...</span> <span class="language-python"> audio_length_in_s=<span class="hljs-number">10.0</span>,</span>
	<span class="hljs-meta prompt_">...</span> <span class="language-python"> num_waveforms_per_prompt=<span class="hljs-number">2</span>,</span>
	<span class="hljs-meta prompt_">...</span> <span class="language-python"> generator=generator,</span>
	<span class="hljs-meta prompt_">...</span> <span class="language-python"> max_new_tokens=<span class="hljs-number">512</span>, <span class="hljs-comment">#Must set max_new_tokens equa to 512 for TTS</span></span>
	<span class="hljs-meta prompt_">...</span> <span class="language-python">).audios</span>

	<span class="hljs-meta prompt_">>>></span> <span class="language-python"><span class="hljs-comment"># save the best audio sample (index 0) as a .wav file</span></span>
	<span class="hljs-meta prompt_">>>></span> <span class="language-python">scipy.io.wavfile.write(<span class="hljs-string">"tts.wav"</span>, rate=<span class="hljs-number">16000</span>, data=audio[<span class="hljs-number">0</span>])</span>`,wrap:!1}}),{c(){c(p.$$.fragment)},l(M){m(p.$$.fragment,M)},m(M,y){u(p,M,y),T=!0},p:Ft,i(M){T\|\|(f(p.$$.fragment,M),T=!0)},o(M){g(p.$$.fragment,M),T=!1},d(M){h(p,M)}}}function Jo(B){let p,T="Example:",M,y,w;return y=new Rt({props:{code:"aW1wb3J0JTIwc2NpcHklMEFpbXBvcnQlMjB0b3JjaCUwQWZyb20lMjBkaWZmdXNlcnMlMjBpbXBvcnQlMjBBdWRpb0xETTJQaXBlbGluZSUwQSUwQXJlcG9faWQlMjAlM0QlMjAlMjJjdnNzcCUyRmF1ZGlvbGRtMiUyMiUwQXBpcGUlMjAlM0QlMjBBdWRpb0xETTJQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQocmVwb19pZCUyQyUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiklMEFwaXBlJTIwJTNEJTIwcGlwZS50byglMjJjdWRhJTIyKSUwQSUwQSUyMyUyMEdldCUyMHRleHQlMjBlbWJlZGRpbmclMjB2ZWN0b3JzJTBBcHJvbXB0X2VtYmVkcyUyQyUyMGF0dGVudGlvbl9tYXNrJTJDJTIwZ2VuZXJhdGVkX3Byb21wdF9lbWJlZHMlMjAlM0QlMjBwaXBlLmVuY29kZV9wcm9tcHQoJTBBJTIwJTIwJTIwJTIwcHJvbXB0JTNEJTIyVGVjaG5vJTIwbXVzaWMlMjB3aXRoJTIwYSUyMHN0cm9uZyUyQyUyMHVwYmVhdCUyMHRlbXBvJTIwYW5kJTIwaGlnaCUyMG1lbG9kaWMlMjByaWZmcyUyMiUyQyUwQSUyMCUyMCUyMCUyMGRldmljZSUzRCUyMmN1ZGElMjIlMkMlMEElMjAlMjAlMjAlMjBkb19jbGFzc2lmaWVyX2ZyZWVfZ3VpZGFuY2UlM0RUcnVlJTJDJTBBKSUwQSUwQSUyMyUyMFBhc3MlMjB0ZXh0JTIwZW1iZWRkaW5ncyUyMHRvJTIwcGlwZWxpbmUlMjBmb3IlMjB0ZXh0LWNvbmRpdGlvbmFsJTIwYXVkaW8lMjBnZW5lcmF0aW9uJTBBYXVkaW8lMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMHByb21wdF9lbWJlZHMlM0Rwcm9tcHRfZW1iZWRzJTJDJTBBJTIwJTIwJTIwJTIwYXR0ZW50aW9uX21hc2slM0RhdHRlbnRpb25fbWFzayUyQyUwQSUyMCUyMCUyMCUyMGdlbmVyYXRlZF9wcm9tcHRfZW1iZWRzJTNEZ2VuZXJhdGVkX3Byb21wdF9lbWJlZHMlMkMlMEElMjAlMjAlMjAlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNEMjAwJTJDJTBBJTIwJTIwJTIwJTIwYXVkaW9fbGVuZ3RoX2luX3MlM0QxMC4wJTJDJTBBKS5hdWRpb3MlNUIwJTVEJTBBJTBBJTIzJTIwc2F2ZSUyMGdlbmVyYXRlZCUyMGF1ZGlvJTIwc2FtcGxlJTBBc2NpcHkuaW8ud2F2ZmlsZS53cml0ZSglMjJ0ZWNobm8ud2F2JTIyJTJDJTIwcmF0ZSUzRDE2MDAwJTJDJTIwZGF0YSUzRGF1ZGlvKQ==",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> scipy
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> torch
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AudioLDM2Pipeline

	<span class="hljs-meta">>>> </span>repo_id = <span class="hljs-string">"cvssp/audioldm2"</span>
	<span class="hljs-meta">>>> </span>pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
	<span class="hljs-meta">>>> </span>pipe = pipe.to(<span class="hljs-string">"cuda"</span>)

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># Get text embedding vectors</span>
	<span class="hljs-meta">>>> </span>prompt_embeds, attention_mask, generated_prompt_embeds = pipe.encode_prompt(
	<span class="hljs-meta">... </span> prompt=<span class="hljs-string">"Techno music with a strong, upbeat tempo and high melodic riffs"</span>,
	<span class="hljs-meta">... </span> device=<span class="hljs-string">"cuda"</span>,
	<span class="hljs-meta">... </span> do_classifier_free_guidance=<span class="hljs-literal">True</span>,
	<span class="hljs-meta">... </span>)

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># Pass text embeddings to pipeline for text-conditional audio generation</span>
	<span class="hljs-meta">>>> </span>audio = pipe(
	<span class="hljs-meta">... </span> prompt_embeds=prompt_embeds,
	<span class="hljs-meta">... </span> attention_mask=attention_mask,
	<span class="hljs-meta">... </span> generated_prompt_embeds=generated_prompt_embeds,
	<span class="hljs-meta">... </span> num_inference_steps=<span class="hljs-number">200</span>,
	<span class="hljs-meta">... </span> audio_length_in_s=<span class="hljs-number">10.0</span>,
	<span class="hljs-meta">... </span>).audios[<span class="hljs-number">0</span>]

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># save generated audio sample</span>
	<span class="hljs-meta">>>> </span>scipy.io.wavfile.write(<span class="hljs-string">"techno.wav"</span>, rate=<span class="hljs-number">16000</span>, data=audio)`,wrap:!1}}),{c(){p=r("p"),p.textContent=T,M=n(),c(y.$$.fragment)},l(l){p=d(l,"P",{"data-svelte-h":!0}),_(p)!=="svelte-11lpom8"&&(p.textContent=T),M=s(l),m(y.$$.fragment,l)},m(l,x){a(l,p,x),a(l,M,x),u(y,l,x),w=!0},p:Ft,i(l){w\|\|(f(y.$$.fragment,l),w=!0)},o(l){g(y.$$.fragment,l),w=!1},d(l){l&&(o(p),o(M)),h(y,l)}}}function Bo(B){let p,T,M,y,w,l,x,Qe,H,Ot='AudioLDM 2 was proposed in <a href="https://huggingface.co/papers/2308.05734" rel="nofollow">AudioLDM 2: Learning Holistic Audio Generation with Self-supervised Pretraining</a> by Haohe Liu et al. AudioLDM 2 takes a text prompt as input and predicts the corresponding audio. It can generate text-conditional sound effects, human speech and music.',Xe,V,Yt='Inspired by <a href="https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview" rel="nofollow">Stable Diffusion</a>, AudioLDM 2 is a text-to-audio <em>latent diffusion model (LDM)</em> that learns continuous audio representations from text embeddings. Two text encoder models are used to compute the text embeddings from a prompt input: the text-branch of <a href="https://huggingface.co/docs/transformers/main/en/model_doc/clap" rel="nofollow">CLAP</a> and the encoder of <a href="https://huggingface.co/docs/transformers/main/en/model_doc/flan-t5" rel="nofollow">Flan-T5</a>. These text embeddings are then projected to a shared embedding space by an <a href="https://huggingface.co/docs/diffusers/main/api/pipelines/audioldm2#diffusers.AudioLDM2ProjectionModel" rel="nofollow">AudioLDM2ProjectionModel</a>. A <a href="https://huggingface.co/docs/transformers/main/en/model_doc/gpt2" rel="nofollow">GPT2</a> <em>language model (LM)</em> is used to auto-regressively predict eight new embedding vectors, conditional on the projected CLAP and Flan-T5 embeddings. The generated embedding vectors and Flan-T5 text embeddings are used as cross-attention conditioning in the LDM. The <a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2UNet2DConditionModel" rel="nofollow">UNet</a> of AudioLDM 2 is unique in the sense that it takes <strong>two</strong> cross-attention embeddings, as opposed to one cross-attention conditioning, as in most other LDMs.',He,z,Kt="The abstract of the paper is the following:",Ve,F,eo='<em>Although audio generation shares commonalities across different types of audio, such as speech, music, and sound effects, designing models for each type requires careful consideration of specific objectives and biases that can significantly differ from those of other types. To bring us closer to a unified perspective of audio generation, this paper proposes a framework that utilizes the same learning method for speech, music, and sound effect generation. Our framework introduces a general representation of audio, called “language of audio” (LOA). Any audio can be translated into LOA based on AudioMAE, a self-supervised pre-trained representation learning model. In the generation process, we translate any modalities into LOA by using a GPT-2 model, and we perform self-supervised audio generation learning with a latent diffusion model conditioned on LOA. The proposed framework naturally brings advantages such as in-context learning abilities and reusable self-supervised pretrained AudioMAE and latent diffusion models. Experiments on the major benchmarks of text-to-audio, text-to-music, and text-to-speech demonstrate state-of-the-art or competitive performance against previous approaches. Our code, pretrained model, and demo are available at <a href="https://audioldm.github.io/audioldm2" rel="nofollow">this https URL</a>.</em>',ze,R,to=`This pipeline was contributed by <a href="https://huggingface.co/sanchit-gandhi" rel="nofollow">sanchit-gandhi</a> and <a href="https://github.com/tuanh123789" rel="nofollow">Nguyễn Công Tú Anh</a>. The original codebase can be
	found at <a href="https://github.com/haoheliu/audioldm2" rel="nofollow">haoheliu/audioldm2</a>.`,Fe,O,Re,Y,Oe,K,oo="AudioLDM2 comes in three variants. Two of these checkpoints are applicable to the general task of text-to-audio generation. The third checkpoint is trained exclusively on text-to-music generation.",Ye,ee,no=`All checkpoints share the same model size for the text encoders and VAE. They differ in the size and depth of the UNet.
	See table below for details on the three checkpoints:`,Ke,te,so='<thead><tr><th>Checkpoint</th> <th>Task</th> <th>UNet Model Size</th> <th>Total Model Size</th> <th>Training Data / h</th></tr></thead> <tbody><tr><td><a href="https://huggingface.co/cvssp/audioldm2" rel="nofollow">audioldm2</a></td> <td>Text-to-audio</td> <td>350M</td> <td>1.1B</td> <td>1150k</td></tr> <tr><td><a href="https://huggingface.co/cvssp/audioldm2-large" rel="nofollow">audioldm2-large</a></td> <td>Text-to-audio</td> <td>750M</td> <td>1.5B</td> <td>1150k</td></tr> <tr><td><a href="https://huggingface.co/cvssp/audioldm2-music" rel="nofollow">audioldm2-music</a></td> <td>Text-to-music</td> <td>350M</td> <td>1.1B</td> <td>665k</td></tr> <tr><td><a href="https://huggingface.co/anhnct/audioldm2_gigaspeech" rel="nofollow">audioldm2-gigaspeech</a></td> <td>Text-to-speech</td> <td>350M</td> <td>1.1B</td> <td>10k</td></tr> <tr><td><a href="https://huggingface.co/anhnct/audioldm2_ljspeech" rel="nofollow">audioldm2-ljspeech</a></td> <td>Text-to-speech</td> <td>350M</td> <td>1.1B</td> <td></td></tr></tbody>',et,oe,tt,ne,ao="<li>Descriptive prompt inputs work best: use adjectives to describe the sound (e.g. “high quality” or “clear”) and make the prompt context specific (e.g. “water stream in a forest” instead of “stream”).</li> <li>It’s best to use general terms like “cat” or “dog” instead of specific names or abstract objects the model may not be familiar with.</li> <li>Using a <strong>negative prompt</strong> can significantly improve the quality of the generated waveform, by guiding the generation away from terms that correspond to poor quality audio. Try using a negative prompt of “Low quality.”</li>",ot,se,nt,ae,io="<li>The <em>quality</em> of the predicted audio sample can be controlled by the <code>num_inference_steps</code> argument; higher steps give higher quality audio at the expense of slower inference.</li> <li>The <em>length</em> of the predicted audio sample can be controlled by varying the <code>audio_length_in_s</code> argument.</li>",st,ie,at,re,ro="<li>The quality of the generated waveforms can vary significantly based on the seed. Try generating with different seeds until you find a satisfactory generation.</li> <li>Multiple waveforms can be generated in one go: set <code>num_waveforms_per_prompt</code> to a value greater than 1. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly.</li>",it,de,lo='The following example demonstrates how to construct good music and speech generation using the aforementioned tips: <a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example" rel="nofollow">example</a>.',rt,P,po='<p>Make sure to check out the Schedulers <a href="../../using-diffusers/schedulers">guide</a> to learn how to explore the tradeoff between scheduler speed and quality, and see the <a href="../../using-diffusers/loading#reuse-a-pipeline">reuse components across pipelines</a> section to learn how to efficiently load the same components into multiple pipelines.</p>',dt,le,lt,b,pe,xt,ke,co="Pipeline for text-to-audio generation using AudioLDM2.",jt,Ae,mo=`This model inherits from <a href="/docs/diffusers/pr_11636/en/api/pipelines/overview#diffusers.DiffusionPipeline">DiffusionPipeline</a>. Check the superclass documentation for the generic methods
	implemented for all pipelines (downloading, saving, running on a particular device, etc.).`,Dt,A,ce,kt,Ue,uo="The call function to the pipeline for generation.",At,I,Ut,Z,Lt,G,me,$t,Le,fo=`Disable sliced VAE decoding. If <code>enable_vae_slicing</code> was previously enabled, this method will go back to
	computing decoding in one step.`,Ct,q,ue,Nt,$e,go=`Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
	to <code>enable_sequential_cpu_offload</code>, this method moves one whole model at a time to the GPU when its <code>forward</code>
	method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
	<code>enable_sequential_cpu_offload</code>, but performance is much better due to the iterative execution of the <code>unet</code>.`,Jt,E,fe,Bt,Ce,ho=`Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
	compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.`,Pt,C,ge,It,Ne,_o="Encodes the prompt into text encoder hidden states.",Zt,W,Gt,S,he,qt,Je,Mo="Generates a sequence of hidden-states from the language model, conditioned on the embedding inputs.",pt,_e,ct,L,Me,Et,Be,bo=`A simple linear projection model to map two text embeddings to a shared latent space. It also inserts learned
	embedding vectors at the start and end of each text embedding sequence respectively. Each variable appended with
	<code>_1</code> refers to that corresponding to the second text encoder. Otherwise, it is from the first.`,Wt,Pe,be,mt,ye,ut,k,ve,St,Ie,yo=`A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
	shaped output. Compared to the vanilla <a href="/docs/diffusers/pr_11636/en/api/models/unet2d-cond#diffusers.UNet2DConditionModel">UNet2DConditionModel</a>, this variant optionally includes an additional
	self-attention layer in each Transformer block, as well as multiple cross-attention layers. It also allows for up
	to two cross-attention embeddings, <code>encoder_hidden_states</code> and <code>encoder_hidden_states_1</code>.`,Qt,Ze,vo=`This model inherits from <a href="/docs/diffusers/pr_11636/en/api/models/overview#diffusers.ModelMixin">ModelMixin</a>. Check the superclass documentation for it’s generic methods implemented
	for all models (such as downloading or saving).`,Xt,Q,we,Ht,Ge,wo='The <a href="/docs/diffusers/pr_11636/en/api/pipelines/audioldm2#diffusers.AudioLDM2UNet2DConditionModel">AudioLDM2UNet2DConditionModel</a> forward method.',ft,Te,gt,J,xe,Vt,qe,To="Output class for audio pipelines.",ht,je,_t,Se,Mt;return w=new Lo({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),x=new N({props:{title:"AudioLDM 2",local:"audioldm-2",headingTag:"h1"}}),O=new N({props:{title:"Tips",local:"tips",headingTag:"h2"}}),Y=new N({props:{title:"Choosing a checkpoint",local:"choosing-a-checkpoint",headingTag:"h3"}}),oe=new N({props:{title:"Constructing a prompt",local:"constructing-a-prompt",headingTag:"h3"}}),se=new N({props:{title:"Controlling inference",local:"controlling-inference",headingTag:"h3"}}),ie=new N({props:{title:"Evaluating generated waveforms:",local:"evaluating-generated-waveforms",headingTag:"h3"}}),le=new N({props:{title:"AudioLDM2Pipeline",local:"diffusers.AudioLDM2Pipeline",headingTag:"h2"}}),pe=new U({props:{name:"class diffusers.AudioLDM2Pipeline",anchor:"diffusers.AudioLDM2Pipeline",parameters:[{name:"vae",val:": AutoencoderKL"},{name:"text_encoder",val:": ClapModel"},{name:"text_encoder_2",val:": typing.Union[transformers.models.t5.modeling_t5.T5EncoderModel, transformers.models.vits.modeling_vits.VitsModel]"},{name:"projection_model",val:": AudioLDM2ProjectionModel"},{name:"language_model",val:": GPT2LMHeadModel"},{name:"tokenizer",val:": typing.Union[transformers.models.roberta.tokenization_roberta.RobertaTokenizer, transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast]"},{name:"tokenizer_2",val:": typing.Union[transformers.models.t5.tokenization_t5.T5Tokenizer, transformers.models.t5.tokenization_t5_fast.T5TokenizerFast, transformers.models.vits.tokenization_vits.VitsTokenizer]"},{name:"feature_extractor",val:": ClapFeatureExtractor"},{name:"unet",val:": AudioLDM2UNet2DConditionModel"},{name:"scheduler",val:": KarrasDiffusionSchedulers"},{name:"vocoder",val:": SpeechT5HifiGan"}],parametersDescription:[{anchor:"diffusers.AudioLDM2Pipeline.vae",description:`<strong>vae</strong> (<a href="/docs/diffusers/pr_11636/en/api/models/autoencoderkl#diffusers.AutoencoderKL">AutoencoderKL</a>) —
	Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.`,name:"vae"},{anchor:"diffusers.AudioLDM2Pipeline.text_encoder",description:`<strong>text_encoder</strong> (<a href="https://huggingface.co/docs/transformers/main/en/model_doc/clap#transformers.ClapModel" rel="nofollow">ClapModel</a>) —
	First frozen text-encoder. AudioLDM2 uses the joint audio-text embedding model
	<a href="https://huggingface.co/docs/transformers/model_doc/clap#transformers.CLAPTextModelWithProjection" rel="nofollow">CLAP</a>,
	specifically the <a href="https://huggingface.co/laion/clap-htsat-unfused" rel="nofollow">laion/clap-htsat-unfused</a> variant. The
	text branch is used to encode the text prompt to a prompt embedding. The full audio-text model is used to
	rank generated waveforms against the text prompt by computing similarity scores.`,name:"text_encoder"},{anchor:"diffusers.AudioLDM2Pipeline.text_encoder_2",description:`<strong>text_encoder_2</strong> ([<code>~transformers.T5EncoderModel</code>, <code>~transformers.VitsModel</code>]) —
	Second frozen text-encoder. AudioLDM2 uses the encoder of
	<a href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel" rel="nofollow">T5</a>, specifically the
	<a href="https://huggingface.co/google/flan-t5-large" rel="nofollow">google/flan-t5-large</a> variant. Second frozen text-encoder use
	for TTS. AudioLDM2 uses the encoder of
	<a href="https://huggingface.co/docs/transformers/model_doc/vits#transformers.VitsModel" rel="nofollow">Vits</a>.`,name:"text_encoder_2"},{anchor:"diffusers.AudioLDM2Pipeline.projection_model",description:`<strong>projection_model</strong> (<a href="/docs/diffusers/pr_11636/en/api/pipelines/audioldm2#diffusers.AudioLDM2ProjectionModel">AudioLDM2ProjectionModel</a>) —
	A trained model used to linearly project the hidden-states from the first and second text encoder models
	and insert learned SOS and EOS token embeddings. The projected hidden-states from the two text encoders are
	concatenated to give the input to the language model. A Learned Position Embedding for the Vits
	hidden-states`,name:"projection_model"},{anchor:"diffusers.AudioLDM2Pipeline.language_model",description:`<strong>language_model</strong> (<a href="https://huggingface.co/docs/transformers/main/en/model_doc/gpt2#transformers.GPT2Model" rel="nofollow">GPT2Model</a>) —
	An auto-regressive language model used to generate a sequence of hidden-states conditioned on the projected
	outputs from the two text encoders.`,name:"language_model"},{anchor:"diffusers.AudioLDM2Pipeline.tokenizer",description:`<strong>tokenizer</strong> (<a href="https://huggingface.co/docs/transformers/main/en/model_doc/mvp#transformers.RobertaTokenizer" rel="nofollow">RobertaTokenizer</a>) —
	Tokenizer to tokenize text for the first frozen text-encoder.`,name:"tokenizer"},{anchor:"diffusers.AudioLDM2Pipeline.tokenizer_2",description:`<strong>tokenizer_2</strong> ([<code>~transformers.T5Tokenizer</code>, <code>~transformers.VitsTokenizer</code>]) —
	Tokenizer to tokenize text for the second frozen text-encoder.`,name:"tokenizer_2"},{anchor:"diffusers.AudioLDM2Pipeline.feature_extractor",description:`<strong>feature_extractor</strong> (<a href="https://huggingface.co/docs/transformers/main/en/model_doc/clap#transformers.ClapFeatureExtractor" rel="nofollow">ClapFeatureExtractor</a>) —
	Feature extractor to pre-process generated audio waveforms to log-mel spectrograms for automatic scoring.`,name:"feature_extractor"},{anchor:"diffusers.AudioLDM2Pipeline.unet",description:`<strong>unet</strong> (<a href="/docs/diffusers/pr_11636/en/api/models/unet2d-cond#diffusers.UNet2DConditionModel">UNet2DConditionModel</a>) —
	A <code>UNet2DConditionModel</code> to denoise the encoded audio latents.`,name:"unet"},{anchor:"diffusers.AudioLDM2Pipeline.scheduler",description:`<strong>scheduler</strong> (<a href="/docs/diffusers/pr_11636/en/api/schedulers/overview#diffusers.SchedulerMixin">SchedulerMixin</a>) —
	A scheduler to be used in combination with <code>unet</code> to denoise the encoded audio latents. Can be one of
	<a href="/docs/diffusers/pr_11636/en/api/schedulers/ddim#diffusers.DDIMScheduler">DDIMScheduler</a>, <a href="/docs/diffusers/pr_11636/en/api/schedulers/lms_discrete#diffusers.LMSDiscreteScheduler">LMSDiscreteScheduler</a>, or <a href="/docs/diffusers/pr_11636/en/api/schedulers/pndm#diffusers.PNDMScheduler">PNDMScheduler</a>.`,name:"scheduler"},{anchor:"diffusers.AudioLDM2Pipeline.vocoder",description:`<strong>vocoder</strong> (<a href="https://huggingface.co/docs/transformers/main/en/model_doc/speecht5#transformers.SpeechT5HifiGan" rel="nofollow">SpeechT5HifiGan</a>) —
	Vocoder of class <code>SpeechT5HifiGan</code> to convert the mel-spectrogram latents to the final audio waveform.`,name:"vocoder"}],source:"https://github.com/huggingface/diffusers/blob/vr_11636/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py#L150"}}),ce=new U({props:{name:"__call__",anchor:"diffusers.AudioLDM2Pipeline.__call__",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]] = None"},{name:"transcription",val:": typing.Union[str, typing.List[str]] = None"},{name:"audio_length_in_s",val:": typing.Optional[float] = None"},{name:"num_inference_steps",val:": int = 200"},{name:"guidance_scale",val:": float = 3.5"},{name:"negative_prompt",val:": typing.Union[str, typing.List[str], NoneType] = None"},{name:"num_waveforms_per_prompt",val:": typing.Optional[int] = 1"},{name:"eta",val:": float = 0.0"},{name:"generator",val:": typing.Union[torch._C.Generator, typing.List[torch._C.Generator], NoneType] = None"},{name:"latents",val:": typing.Optional[torch.Tensor] = None"},{name:"prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"generated_prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_generated_prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"attention_mask",val:": typing.Optional[torch.LongTensor] = None"},{name:"negative_attention_mask",val:": typing.Optional[torch.LongTensor] = None"},{name:"max_new_tokens",val:": typing.Optional[int] = None"},{name:"return_dict",val:": bool = True"},{name:"callback",val:": typing.Optional[typing.Callable[[int, int, torch.Tensor], NoneType]] = None"},{name:"callback_steps",val:": typing.Optional[int] = 1"},{name:"cross_attention_kwargs",val:": typing.Optional[typing.Dict[str, typing.Any]] = None"},{name:"output_type",val:": typing.Optional[str] = 'np'"}],parametersDescription:[{anchor:"diffusers.AudioLDM2Pipeline.__call__.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) —
	The prompt or prompts to guide audio generation. If not defined, you need to pass <code>prompt_embeds</code>.`,name:"prompt"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.transcription",description:`<strong>transcription</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) —\\
	The transcript for text to speech.`,name:"transcription"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.audio_length_in_s",description:`<strong>audio_length_in_s</strong> (<code>int</code>, <em>optional</em>, defaults to 10.24) —
	The length of the generated audio sample in seconds.`,name:"audio_length_in_s"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.num_inference_steps",description:`<strong>num_inference_steps</strong> (<code>int</code>, <em>optional</em>, defaults to 200) —
	The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
	expense of slower inference.`,name:"num_inference_steps"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.guidance_scale",description:`<strong>guidance_scale</strong> (<code>float</code>, <em>optional</em>, defaults to 3.5) —
	A higher guidance scale value encourages the model to generate audio that is closely linked to the text
	<code>prompt</code> at the expense of lower sound quality. Guidance scale is enabled when <code>guidance_scale > 1</code>.`,name:"guidance_scale"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.negative_prompt",description:`<strong>negative_prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) —
	The prompt or prompts to guide what to not include in audio generation. If not defined, you need to
	pass <code>negative_prompt_embeds</code> instead. Ignored when not using guidance (<code>guidance_scale < 1</code>).`,name:"negative_prompt"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.num_waveforms_per_prompt",description:`<strong>num_waveforms_per_prompt</strong> (<code>int</code>, <em>optional</em>, defaults to 1) —
	The number of waveforms to generate per prompt. If <code>num_waveforms_per_prompt > 1</code>, then automatic
	scoring is performed between the generated outputs and the text prompt. This scoring ranks the
	generated waveforms based on their cosine similarity with the text input in the joint text-audio
	embedding space.`,name:"num_waveforms_per_prompt"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.eta",description:`<strong>eta</strong> (<code>float</code>, <em>optional</em>, defaults to 0.0) —
	Corresponds to parameter eta (η) from the <a href="https://huggingface.co/papers/2010.02502" rel="nofollow">DDIM</a> paper. Only
	applies to the <a href="/docs/diffusers/pr_11636/en/api/schedulers/ddim#diffusers.DDIMScheduler">DDIMScheduler</a>, and is ignored in other schedulers.`,name:"eta"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.generator",description:`<strong>generator</strong> (<code>torch.Generator</code> or <code>List[torch.Generator]</code>, <em>optional</em>) —
	A <a href="https://pytorch.org/docs/stable/generated/torch.Generator.html" rel="nofollow"><code>torch.Generator</code></a> to make
	generation deterministic.`,name:"generator"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.latents",description:`<strong>latents</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for spectrogram
	generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
	tensor is generated by sampling using the supplied random <code>generator</code>.`,name:"latents"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.prompt_embeds",description:`<strong>prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
	provided, text embeddings are generated from the <code>prompt</code> input argument.`,name:"prompt_embeds"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.negative_prompt_embeds",description:`<strong>negative_prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
	not provided, <code>negative_prompt_embeds</code> are generated from the <code>negative_prompt</code> input argument.`,name:"negative_prompt_embeds"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.generated_prompt_embeds",description:`<strong>generated_prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	Pre-generated text embeddings from the GPT2 language model. Can be used to easily tweak text inputs,
	<em>e.g.</em> prompt weighting. If not provided, text embeddings will be generated from <code>prompt</code> input
	argument.`,name:"generated_prompt_embeds"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.negative_generated_prompt_embeds",description:`<strong>negative_generated_prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text
	inputs, <em>e.g.</em> prompt weighting. If not provided, negative_prompt_embeds will be computed from
	<code>negative_prompt</code> input argument.`,name:"negative_generated_prompt_embeds"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.attention_mask",description:`<strong>attention_mask</strong> (<code>torch.LongTensor</code>, <em>optional</em>) —
	Pre-computed attention mask to be applied to the <code>prompt_embeds</code>. If not provided, attention mask will
	be computed from <code>prompt</code> input argument.`,name:"attention_mask"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.negative_attention_mask",description:`<strong>negative_attention_mask</strong> (<code>torch.LongTensor</code>, <em>optional</em>) —
	Pre-computed attention mask to be applied to the <code>negative_prompt_embeds</code>. If not provided, attention
	mask will be computed from <code>negative_prompt</code> input argument.`,name:"negative_attention_mask"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.max_new_tokens",description:`<strong>max_new_tokens</strong> (<code>int</code>, <em>optional</em>, defaults to None) —
	Number of new tokens to generate with the GPT2 language model. If not provided, number of tokens will
	be taken from the config of the model.`,name:"max_new_tokens"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) —
	Whether or not to return a <a href="/docs/diffusers/pr_11636/en/api/pipelines/stable_diffusion/text2img#diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput">StableDiffusionPipelineOutput</a> instead of a
	plain tuple.`,name:"return_dict"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.callback",description:`<strong>callback</strong> (<code>Callable</code>, <em>optional</em>) —
	A function that calls every <code>callback_steps</code> steps during inference. The function is called with the
	following arguments: <code>callback(step: int, timestep: int, latents: torch.Tensor)</code>.`,name:"callback"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.callback_steps",description:`<strong>callback_steps</strong> (<code>int</code>, <em>optional</em>, defaults to 1) —
	The frequency at which the <code>callback</code> function is called. If not specified, the callback is called at
	every step.`,name:"callback_steps"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.cross_attention_kwargs",description:`<strong>cross_attention_kwargs</strong> (<code>dict</code>, <em>optional</em>) —
	A kwargs dictionary that if specified is passed along to the <code>AttentionProcessor</code> as defined in
	<a href="https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py" rel="nofollow"><code>self.processor</code></a>.`,name:"cross_attention_kwargs"},{anchor:"diffusers.AudioLDM2Pipeline.__call__.output_type",description:`<strong>output_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"np"</code>) —
	The output format of the generated audio. Choose between <code>"np"</code> to return a NumPy <code>np.ndarray</code> or
	<code>"pt"</code> to return a PyTorch <code>torch.Tensor</code> object. Set to <code>"latent"</code> to return the latent diffusion
	model (LDM) output.`,name:"output_type"}],source:"https://github.com/huggingface/diffusers/blob/vr_11636/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py#L861",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>


	<p>If <code>return_dict</code> is <code>True</code>, <a
	href="/docs/diffusers/pr_11636/en/api/pipelines/stable_diffusion/text2img#diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput"
	>StableDiffusionPipelineOutput</a> is returned,
	otherwise a <code>tuple</code> is returned where the first element is a list with the generated audio.</p>
	`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>


	<p><a
	href="/docs/diffusers/pr_11636/en/api/pipelines/stable_diffusion/text2img#diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput"
	>StableDiffusionPipelineOutput</a> or <code>tuple</code></p>
	`}}),I=new zt({props:{anchor:"diffusers.AudioLDM2Pipeline.__call__.example",$$slots:{default:[Co]},$$scope:{ctx:B}}}),Z=new zt({props:{anchor:"diffusers.AudioLDM2Pipeline.__call__.example-2",$$slots:{default:[No]},$$scope:{ctx:B}}}),me=new U({props:{name:"disable_vae_slicing",anchor:"diffusers.AudioLDM2Pipeline.disable_vae_slicing",parameters:[],source:"https://github.com/huggingface/diffusers/blob/vr_11636/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py#L241"}}),ue=new U({props:{name:"enable_model_cpu_offload",anchor:"diffusers.AudioLDM2Pipeline.enable_model_cpu_offload",parameters:[{name:"gpu_id",val:": typing.Optional[int] = None"},{name:"device",val:": typing.Union[torch.device, str] = 'cuda'"}],source:"https://github.com/huggingface/diffusers/blob/vr_11636/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py#L254"}}),fe=new U({props:{name:"enable_vae_slicing",anchor:"diffusers.AudioLDM2Pipeline.enable_vae_slicing",parameters:[],source:"https://github.com/huggingface/diffusers/blob/vr_11636/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py#L227"}}),ge=new U({props:{name:"encode_prompt",anchor:"diffusers.AudioLDM2Pipeline.encode_prompt",parameters:[{name:"prompt",val:""},{name:"device",val:""},{name:"num_waveforms_per_prompt",val:""},{name:"do_classifier_free_guidance",val:""},{name:"transcription",val:" = None"},{name:"negative_prompt",val:" = None"},{name:"prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"generated_prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_generated_prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"attention_mask",val:": typing.Optional[torch.LongTensor] = None"},{name:"negative_attention_mask",val:": typing.Optional[torch.LongTensor] = None"},{name:"max_new_tokens",val:": typing.Optional[int] = None"}],parametersDescription:[{anchor:"diffusers.AudioLDM2Pipeline.encode_prompt.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) —
	prompt to be encoded`,name:"prompt"},{anchor:"diffusers.AudioLDM2Pipeline.encode_prompt.transcription",description:`<strong>transcription</strong> (<code>str</code> or <code>List[str]</code>) —
	transcription of text to speech`,name:"transcription"},{anchor:"diffusers.AudioLDM2Pipeline.encode_prompt.device",description:`<strong>device</strong> (<code>torch.device</code>) —
	torch device`,name:"device"},{anchor:"diffusers.AudioLDM2Pipeline.encode_prompt.num_waveforms_per_prompt",description:`<strong>num_waveforms_per_prompt</strong> (<code>int</code>) —
	number of waveforms that should be generated per prompt`,name:"num_waveforms_per_prompt"},{anchor:"diffusers.AudioLDM2Pipeline.encode_prompt.do_classifier_free_guidance",description:`<strong>do_classifier_free_guidance</strong> (<code>bool</code>) —
	whether to use classifier free guidance or not`,name:"do_classifier_free_guidance"},{anchor:"diffusers.AudioLDM2Pipeline.encode_prompt.negative_prompt",description:`<strong>negative_prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) —
	The prompt or prompts not to guide the audio generation. If not defined, one has to pass
	<code>negative_prompt_embeds</code> instead. Ignored when not using guidance (i.e., ignored if <code>guidance_scale</code> is
	less than <code>1</code>).`,name:"negative_prompt"},{anchor:"diffusers.AudioLDM2Pipeline.encode_prompt.prompt_embeds",description:`<strong>prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	Pre-computed text embeddings from the Flan T5 model. Can be used to easily tweak text inputs, <em>e.g.</em>
	prompt weighting. If not provided, text embeddings will be computed from <code>prompt</code> input argument.`,name:"prompt_embeds"},{anchor:"diffusers.AudioLDM2Pipeline.encode_prompt.negative_prompt_embeds",description:`<strong>negative_prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	Pre-computed negative text embeddings from the Flan T5 model. Can be used to easily tweak text inputs,
	<em>e.g.</em> prompt weighting. If not provided, negative_prompt_embeds will be computed from
	<code>negative_prompt</code> input argument.`,name:"negative_prompt_embeds"},{anchor:"diffusers.AudioLDM2Pipeline.encode_prompt.generated_prompt_embeds",description:`<strong>generated_prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	Pre-generated text embeddings from the GPT2 language model. Can be used to easily tweak text inputs,
	<em>e.g.</em> prompt weighting. If not provided, text embeddings will be generated from <code>prompt</code> input
	argument.`,name:"generated_prompt_embeds"},{anchor:"diffusers.AudioLDM2Pipeline.encode_prompt.negative_generated_prompt_embeds",description:`<strong>negative_generated_prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text
	inputs, <em>e.g.</em> prompt weighting. If not provided, negative_prompt_embeds will be computed from
	<code>negative_prompt</code> input argument.`,name:"negative_generated_prompt_embeds"},{anchor:"diffusers.AudioLDM2Pipeline.encode_prompt.attention_mask",description:`<strong>attention_mask</strong> (<code>torch.LongTensor</code>, <em>optional</em>) —
	Pre-computed attention mask to be applied to the <code>prompt_embeds</code>. If not provided, attention mask will
	be computed from <code>prompt</code> input argument.`,name:"attention_mask"},{anchor:"diffusers.AudioLDM2Pipeline.encode_prompt.negative_attention_mask",description:`<strong>negative_attention_mask</strong> (<code>torch.LongTensor</code>, <em>optional</em>) —
	Pre-computed attention mask to be applied to the <code>negative_prompt_embeds</code>. If not provided, attention
	mask will be computed from <code>negative_prompt</code> input argument.`,name:"negative_attention_mask"},{anchor:"diffusers.AudioLDM2Pipeline.encode_prompt.max_new_tokens",description:`<strong>max_new_tokens</strong> (<code>int</code>, <em>optional</em>, defaults to None) —
	The number of new tokens to generate with the GPT2 language model.`,name:"max_new_tokens"}],source:"https://github.com/huggingface/diffusers/blob/vr_11636/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py#L356",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>


	<p>Text embeddings from the Flan T5 model.
	attention_mask (<code>torch.LongTensor</code>):
	Attention mask to be applied to the <code>prompt_embeds</code>.
	generated_prompt_embeds (<code>torch.Tensor</code>):
	Text embeddings generated from the GPT2 language model.</p>
	`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>


	<p>prompt_embeds (<code>torch.Tensor</code>)</p>
	`}}),W=new zt({props:{anchor:"diffusers.AudioLDM2Pipeline.encode_prompt.example",$$slots:{default:[Jo]},$$scope:{ctx:B}}}),he=new U({props:{name:"generate_language_model",anchor:"diffusers.AudioLDM2Pipeline.generate_language_model",parameters:[{name:"inputs_embeds",val:": Tensor = None"},{name:"max_new_tokens",val:": int = 8"},{name:"**model_kwargs",val:""}],parametersDescription:[{anchor:"diffusers.AudioLDM2Pipeline.generate_language_model.inputs_embeds",description:`<strong>inputs_embeds</strong> (<code>torch.Tensor</code> of shape <code>(batch_size, sequence_length, hidden_size)</code>) —
	The sequence used as a prompt for the generation.`,name:"inputs_embeds"},{anchor:"diffusers.AudioLDM2Pipeline.generate_language_model.max_new_tokens",description:`<strong>max_new_tokens</strong> (<code>int</code>) —
	Number of new tokens to generate.`,name:"max_new_tokens"},{anchor:"diffusers.AudioLDM2Pipeline.generate_language_model.model_kwargs",description:`<strong>model_kwargs</strong> (<code>Dict[str, Any]</code>, <em>optional</em>) —
	Ad hoc parametrization of additional model-specific kwargs that will be forwarded to the <code>forward</code>
	function of the model.`,name:"model_kwargs"}],source:"https://github.com/huggingface/diffusers/blob/vr_11636/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py#L304",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>


	<p>The sequence of generated hidden-states.</p>
	`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>


	<p><code>inputs_embeds (</code>torch.Tensor<code>of shape</code>(batch_size, sequence_length, hidden_size)\`)</p>
	`}}),_e=new N({props:{title:"AudioLDM2ProjectionModel",local:"diffusers.AudioLDM2ProjectionModel",headingTag:"h2"}}),Me=new U({props:{name:"class diffusers.AudioLDM2ProjectionModel",anchor:"diffusers.AudioLDM2ProjectionModel",parameters:[{name:"text_encoder_dim",val:""},{name:"text_encoder_1_dim",val:""},{name:"langauge_model_dim",val:""},{name:"use_learned_position_embedding",val:" = None"},{name:"max_seq_length",val:" = None"}],parametersDescription:[{anchor:"diffusers.AudioLDM2ProjectionModel.text_encoder_dim",description:`<strong>text_encoder_dim</strong> (<code>int</code>) —
	Dimensionality of the text embeddings from the first text encoder (CLAP).`,name:"text_encoder_dim"},{anchor:"diffusers.AudioLDM2ProjectionModel.text_encoder_1_dim",description:`<strong>text_encoder_1_dim</strong> (<code>int</code>) —
	Dimensionality of the text embeddings from the second text encoder (T5 or VITS).`,name:"text_encoder_1_dim"},{anchor:"diffusers.AudioLDM2ProjectionModel.langauge_model_dim",description:`<strong>langauge_model_dim</strong> (<code>int</code>) —
	Dimensionality of the text embeddings from the language model (GPT2).`,name:"langauge_model_dim"}],source:"https://github.com/huggingface/diffusers/blob/vr_11636/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py#L81"}}),be=new U({props:{name:"forward",anchor:"diffusers.AudioLDM2ProjectionModel.forward",parameters:[{name:"hidden_states",val:": typing.Optional[torch.Tensor] = None"},{name:"hidden_states_1",val:": typing.Optional[torch.Tensor] = None"},{name:"attention_mask",val:": typing.Optional[torch.LongTensor] = None"},{name:"attention_mask_1",val:": typing.Optional[torch.LongTensor] = None"}],source:"https://github.com/huggingface/diffusers/blob/vr_11636/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py#L125"}}),ye=new N({props:{title:"AudioLDM2UNet2DConditionModel",local:"diffusers.AudioLDM2UNet2DConditionModel",headingTag:"h2"}}),ve=new U({props:{name:"class diffusers.AudioLDM2UNet2DConditionModel",anchor:"diffusers.AudioLDM2UNet2DConditionModel",parameters:[{name:"sample_size",val:": typing.Optional[int] = None"},{name:"in_channels",val:": int = 4"},{name:"out_channels",val:": int = 4"},{name:"flip_sin_to_cos",val:": bool = True"},{name:"freq_shift",val:": int = 0"},{name:"down_block_types",val:": typing.Tuple[str, ...] = ('CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'DownBlock2D')"},{name:"mid_block_type",val:": typing.Optional[str] = 'UNetMidBlock2DCrossAttn'"},{name:"up_block_types",val:": typing.Tuple[str, ...] = ('UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D')"},{name:"only_cross_attention",val:": typing.Union[bool, typing.Tuple[bool]] = False"},{name:"block_out_channels",val:": typing.Tuple[int, ...] = (320, 640, 1280, 1280)"},{name:"layers_per_block",val:": typing.Union[int, typing.Tuple[int]] = 2"},{name:"downsample_padding",val:": int = 1"},{name:"mid_block_scale_factor",val:": float = 1"},{name:"act_fn",val:": str = 'silu'"},{name:"norm_num_groups",val:": typing.Optional[int] = 32"},{name:"norm_eps",val:": float = 1e-05"},{name:"cross_attention_dim",val:": typing.Union[int, typing.Tuple[int]] = 1280"},{name:"transformer_layers_per_block",val:": typing.Union[int, typing.Tuple[int]] = 1"},{name:"attention_head_dim",val:": typing.Union[int, typing.Tuple[int]] = 8"},{name:"num_attention_heads",val:": typing.Union[int, typing.Tuple[int], NoneType] = None"},{name:"use_linear_projection",val:": bool = False"},{name:"class_embed_type",val:": typing.Optional[str] = None"},{name:"num_class_embeds",val:": typing.Optional[int] = None"},{name:"upcast_attention",val:": bool = False"},{name:"resnet_time_scale_shift",val:": str = 'default'"},{name:"time_embedding_type",val:": str = 'positional'"},{name:"time_embedding_dim",val:": typing.Optional[int] = None"},{name:"time_embedding_act_fn",val:": typing.Optional[str] = None"},{name:"timestep_post_act",val:": typing.Optional[str] = None"},{name:"time_cond_proj_dim",val:": typing.Optional[int] = None"},{name:"conv_in_kernel",val:": int = 3"},{name:"conv_out_kernel",val:": int = 3"},{name:"projection_class_embeddings_input_dim",val:": typing.Optional[int] = None"},{name:"class_embeddings_concat",val:": bool = False"}],parametersDescription:[{anchor:"diffusers.AudioLDM2UNet2DConditionModel.sample_size",description:`<strong>sample_size</strong> (<code>int</code> or <code>Tuple[int, int]</code>, <em>optional</em>, defaults to <code>None</code>) —
	Height and width of input/output sample.`,name:"sample_size"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.in_channels",description:"<strong>in_channels</strong> (<code>int</code>, <em>optional</em>, defaults to 4) — Number of channels in the input sample.",name:"in_channels"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.out_channels",description:"<strong>out_channels</strong> (<code>int</code>, <em>optional</em>, defaults to 4) — Number of channels in the output.",name:"out_channels"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.flip_sin_to_cos",description:`<strong>flip_sin_to_cos</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) —
	Whether to flip the sin to cos in the time embedding.`,name:"flip_sin_to_cos"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.freq_shift",description:"<strong>freq_shift</strong> (<code>int</code>, <em>optional</em>, defaults to 0) — The frequency shift to apply to the time embedding.",name:"freq_shift"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.down_block_types",description:`<strong>down_block_types</strong> (<code>Tuple[str]</code>, <em>optional</em>, defaults to <code>("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")</code>) —
	The tuple of downsample blocks to use.`,name:"down_block_types"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.mid_block_type",description:`<strong>mid_block_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"UNetMidBlock2DCrossAttn"</code>) —
	Block type for middle of UNet, it can only be <code>UNetMidBlock2DCrossAttn</code> for AudioLDM2.`,name:"mid_block_type"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.up_block_types",description:`<strong>up_block_types</strong> (<code>Tuple[str]</code>, <em>optional</em>, defaults to <code>("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")</code>) —
	The tuple of upsample blocks to use.`,name:"up_block_types"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.only_cross_attention",description:`<strong>only_cross_attention</strong> (<code>bool</code> or <code>Tuple[bool]</code>, <em>optional</em>, default to <code>False</code>) —
	Whether to include self-attention in the basic transformer blocks, see
	<code>BasicTransformerBlock</code>.`,name:"only_cross_attention"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.block_out_channels",description:`<strong>block_out_channels</strong> (<code>Tuple[int]</code>, <em>optional</em>, defaults to <code>(320, 640, 1280, 1280)</code>) —
	The tuple of output channels for each block.`,name:"block_out_channels"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.layers_per_block",description:"<strong>layers_per_block</strong> (<code>int</code>, <em>optional</em>, defaults to 2) — The number of layers per block.",name:"layers_per_block"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.downsample_padding",description:"<strong>downsample_padding</strong> (<code>int</code>, <em>optional</em>, defaults to 1) — The padding to use for the downsampling convolution.",name:"downsample_padding"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.mid_block_scale_factor",description:"<strong>mid_block_scale_factor</strong> (<code>float</code>, <em>optional</em>, defaults to 1.0) — The scale factor to use for the mid block.",name:"mid_block_scale_factor"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.act_fn",description:"<strong>act_fn</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"silu"</code>) — The activation function to use.",name:"act_fn"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.norm_num_groups",description:`<strong>norm_num_groups</strong> (<code>int</code>, <em>optional</em>, defaults to 32) — The number of groups to use for the normalization.
	If <code>None</code>, normalization and activation layers is skipped in post-processing.`,name:"norm_num_groups"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.norm_eps",description:"<strong>norm_eps</strong> (<code>float</code>, <em>optional</em>, defaults to 1e-5) — The epsilon to use for the normalization.",name:"norm_eps"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.cross_attention_dim",description:`<strong>cross_attention_dim</strong> (<code>int</code> or <code>Tuple[int]</code>, <em>optional</em>, defaults to 1280) —
	The dimension of the cross attention features.`,name:"cross_attention_dim"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.transformer_layers_per_block",description:`<strong>transformer_layers_per_block</strong> (<code>int</code> or <code>Tuple[int]</code>, <em>optional</em>, defaults to 1) —
	The number of transformer blocks of type <code>BasicTransformerBlock</code>. Only relevant for
	<code>~models.unet_2d_blocks.CrossAttnDownBlock2D</code>, <code>~models.unet_2d_blocks.CrossAttnUpBlock2D</code>,
	<code>~models.unet_2d_blocks.UNetMidBlock2DCrossAttn</code>.`,name:"transformer_layers_per_block"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.attention_head_dim",description:"<strong>attention_head_dim</strong> (<code>int</code>, <em>optional</em>, defaults to 8) — The dimension of the attention heads.",name:"attention_head_dim"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.num_attention_heads",description:`<strong>num_attention_heads</strong> (<code>int</code>, <em>optional</em>) —
	The number of attention heads. If not defined, defaults to <code>attention_head_dim</code>`,name:"num_attention_heads"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.resnet_time_scale_shift",description:`<strong>resnet_time_scale_shift</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"default"</code>) — Time scale shift config
	for ResNet blocks (see <code>ResnetBlock2D</code>). Choose from <code>default</code> or <code>scale_shift</code>.`,name:"resnet_time_scale_shift"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.class_embed_type",description:`<strong>class_embed_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>None</code>) —
	The type of class embedding to use which is ultimately summed with the time embeddings. Choose from <code>None</code>,
	<code>"timestep"</code>, <code>"identity"</code>, <code>"projection"</code>, or <code>"simple_projection"</code>.`,name:"class_embed_type"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.num_class_embeds",description:`<strong>num_class_embeds</strong> (<code>int</code>, <em>optional</em>, defaults to <code>None</code>) —
	Input dimension of the learnable embedding matrix to be projected to <code>time_embed_dim</code>, when performing
	class conditioning with <code>class_embed_type</code> equal to <code>None</code>.`,name:"num_class_embeds"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.time_embedding_type",description:`<strong>time_embedding_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>positional</code>) —
	The type of position embedding to use for timesteps. Choose from <code>positional</code> or <code>fourier</code>.`,name:"time_embedding_type"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.time_embedding_dim",description:`<strong>time_embedding_dim</strong> (<code>int</code>, <em>optional</em>, defaults to <code>None</code>) —
	An optional override for the dimension of the projected time embedding.`,name:"time_embedding_dim"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.time_embedding_act_fn",description:`<strong>time_embedding_act_fn</strong> (<code>str</code>, <em>optional</em>, defaults to <code>None</code>) —
	Optional activation function to use only once on the time embeddings before they are passed to the rest of
	the UNet. Choose from <code>silu</code>, <code>mish</code>, <code>gelu</code>, and <code>swish</code>.`,name:"time_embedding_act_fn"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.timestep_post_act",description:`<strong>timestep_post_act</strong> (<code>str</code>, <em>optional</em>, defaults to <code>None</code>) —
	The second activation function to use in timestep embedding. Choose from <code>silu</code>, <code>mish</code> and <code>gelu</code>.`,name:"timestep_post_act"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.time_cond_proj_dim",description:`<strong>time_cond_proj_dim</strong> (<code>int</code>, <em>optional</em>, defaults to <code>None</code>) —
	The dimension of <code>cond_proj</code> layer in the timestep embedding.`,name:"time_cond_proj_dim"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.conv_in_kernel",description:"<strong>conv_in_kernel</strong> (<code>int</code>, <em>optional</em>, default to <code>3</code>) — The kernel size of <code>conv_in</code> layer.",name:"conv_in_kernel"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.conv_out_kernel",description:"<strong>conv_out_kernel</strong> (<code>int</code>, <em>optional</em>, default to <code>3</code>) — The kernel size of <code>conv_out</code> layer.",name:"conv_out_kernel"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.projection_class_embeddings_input_dim",description:`<strong>projection_class_embeddings_input_dim</strong> (<code>int</code>, <em>optional</em>) — The dimension of the <code>class_labels</code> input when
	<code>class_embed_type="projection"</code>. Required when <code>class_embed_type="projection"</code>.`,name:"projection_class_embeddings_input_dim"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.class_embeddings_concat",description:`<strong>class_embeddings_concat</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) — Whether to concatenate the time
	embeddings with the class embeddings.`,name:"class_embeddings_concat"}],source:"https://github.com/huggingface/diffusers/blob/vr_11636/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py#L166"}}),we=new U({props:{name:"forward",anchor:"diffusers.AudioLDM2UNet2DConditionModel.forward",parameters:[{name:"sample",val:": Tensor"},{name:"timestep",val:": typing.Union[torch.Tensor, float, int]"},{name:"encoder_hidden_states",val:": Tensor"},{name:"class_labels",val:": typing.Optional[torch.Tensor] = None"},{name:"timestep_cond",val:": typing.Optional[torch.Tensor] = None"},{name:"attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"cross_attention_kwargs",val:": typing.Optional[typing.Dict[str, typing.Any]] = None"},{name:"encoder_attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"return_dict",val:": bool = True"},{name:"encoder_hidden_states_1",val:": typing.Optional[torch.Tensor] = None"},{name:"encoder_attention_mask_1",val:": typing.Optional[torch.Tensor] = None"}],parametersDescription:[{anchor:"diffusers.AudioLDM2UNet2DConditionModel.forward.sample",description:`<strong>sample</strong> (<code>torch.Tensor</code>) —
	The noisy input tensor with the following shape <code>(batch, channel, height, width)</code>.`,name:"sample"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.forward.timestep",description:"<strong>timestep</strong> (<code>torch.Tensor</code> or <code>float</code> or <code>int</code>) — The number of timesteps to denoise an input.",name:"timestep"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.forward.encoder_hidden_states",description:`<strong>encoder_hidden_states</strong> (<code>torch.Tensor</code>) —
	The encoder hidden states with shape <code>(batch, sequence_length, feature_dim)</code>.`,name:"encoder_hidden_states"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.forward.encoder_attention_mask",description:`<strong>encoder_attention_mask</strong> (<code>torch.Tensor</code>) —
	A cross-attention mask of shape <code>(batch, sequence_length)</code> is applied to <code>encoder_hidden_states</code>. If
	<code>True</code> the mask is kept, otherwise if <code>False</code> it is discarded. Mask will be converted into a bias,
	which adds large negative values to the attention scores corresponding to “discard” tokens.`,name:"encoder_attention_mask"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.forward.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) —
	Whether or not to return a <a href="/docs/diffusers/pr_11636/en/api/models/unet2d-cond#diffusers.models.unets.unet_2d_condition.UNet2DConditionOutput">UNet2DConditionOutput</a> instead of a plain
	tuple.`,name:"return_dict"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.forward.cross_attention_kwargs",description:`<strong>cross_attention_kwargs</strong> (<code>dict</code>, <em>optional</em>) —
	A kwargs dictionary that if specified is passed along to the <code>AttnProcessor</code>.`,name:"cross_attention_kwargs"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.forward.encoder_hidden_states_1",description:`<strong>encoder_hidden_states_1</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	A second set of encoder hidden states with shape <code>(batch, sequence_length_2, feature_dim_2)</code>. Can be
	used to condition the model on a different set of embeddings to <code>encoder_hidden_states</code>.`,name:"encoder_hidden_states_1"},{anchor:"diffusers.AudioLDM2UNet2DConditionModel.forward.encoder_attention_mask_1",description:`<strong>encoder_attention_mask_1</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	A cross-attention mask of shape <code>(batch, sequence_length_2)</code> is applied to <code>encoder_hidden_states_1</code>.
	If <code>True</code> the mask is kept, otherwise if <code>False</code> it is discarded. Mask will be converted into a bias,
	which adds large negative values to the attention scores corresponding to “discard” tokens.`,name:"encoder_attention_mask_1"}],source:"https://github.com/huggingface/diffusers/blob/vr_11636/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py#L620",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>


	<p>If <code>return_dict</code> is True, an <a
	href="/docs/diffusers/pr_11636/en/api/models/unet2d-cond#diffusers.models.unets.unet_2d_condition.UNet2DConditionOutput"
	>UNet2DConditionOutput</a> is returned,
	otherwise a <code>tuple</code> is returned where the first element is the sample tensor.</p>
	`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>


	<p><a
	href="/docs/diffusers/pr_11636/en/api/models/unet2d-cond#diffusers.models.unets.unet_2d_condition.UNet2DConditionOutput"
	>UNet2DConditionOutput</a> or <code>tuple</code></p>
	`}}),Te=new N({props:{title:"AudioPipelineOutput",local:"diffusers.AudioPipelineOutput",headingTag:"h2"}}),xe=new U({props:{name:"class diffusers.AudioPipelineOutput",anchor:"diffusers.AudioPipelineOutput",parameters:[{name:"audios",val:": ndarray"}],parametersDescription:[{anchor:"diffusers.AudioPipelineOutput.audios",description:`<strong>audios</strong> (<code>np.ndarray</code>) —
	List of denoised audio samples of a NumPy array of shape <code>(batch_size, num_channels, sample_rate)</code>.`,name:"audios"}],source:"https://github.com/huggingface/diffusers/blob/vr_11636/src/diffusers/pipelines/pipeline_utils.py#L132"}}),je=new $o({props:{source:"https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/audioldm2.md"}}),{c(){p=r("meta"),T=n(),M=r("p"),y=n(),c(w.$$.fragment),l=n(),c(x.$$.fragment),Qe=n(),H=r("p"),H.innerHTML=Ot,Xe=n(),V=r("p"),V.innerHTML=Yt,He=n(),z=r("p"),z.textContent=Kt,Ve=n(),F=r("p"),F.innerHTML=eo,ze=n(),R=r("p"),R.innerHTML=to,Fe=n(),c(O.$$.fragment),Re=n(),c(Y.$$.fragment),Oe=n(),K=r("p"),K.textContent=oo,Ye=n(),ee=r("p"),ee.textContent=no,Ke=n(),te=r("table"),te.innerHTML=so,et=n(),c(oe.$$.fragment),tt=n(),ne=r("ul"),ne.innerHTML=ao,ot=n(),c(se.$$.fragment),nt=n(),ae=r("ul"),ae.innerHTML=io,st=n(),c(ie.$$.fragment),at=n(),re=r("ul"),re.innerHTML=ro,it=n(),de=r("p"),de.innerHTML=lo,rt=n(),P=r("blockquote"),P.innerHTML=po,dt=n(),c(le.$$.fragment),lt=n(),b=r("div"),c(pe.$$.fragment),xt=n(),ke=r("p"),ke.textContent=co,jt=n(),Ae=r("p"),Ae.innerHTML=mo,Dt=n(),A=r("div"),c(ce.$$.fragment),kt=n(),Ue=r("p"),Ue.textContent=uo,At=n(),c(I.$$.fragment),Ut=n(),c(Z.$$.fragment),Lt=n(),G=r("div"),c(me.$$.fragment),$t=n(),Le=r("p"),Le.innerHTML=fo,Ct=n(),q=r("div"),c(ue.$$.fragment),Nt=n(),$e=r("p"),$e.innerHTML=go,Jt=n(),E=r("div"),c(fe.$$.fragment),Bt=n(),Ce=r("p"),Ce.textContent=ho,Pt=n(),C=r("div"),c(ge.$$.fragment),It=n(),Ne=r("p"),Ne.textContent=_o,Zt=n(),c(W.$$.fragment),Gt=n(),S=r("div"),c(he.$$.fragment),qt=n(),Je=r("p"),Je.textContent=Mo,pt=n(),c(_e.$$.fragment),ct=n(),L=r("div"),c(Me.$$.fragment),Et=n(),Be=r("p"),Be.innerHTML=bo,Wt=n(),Pe=r("div"),c(be.$$.fragment),mt=n(),c(ye.$$.fragment),ut=n(),k=r("div"),c(ve.$$.fragment),St=n(),Ie=r("p"),Ie.innerHTML=yo,Qt=n(),Ze=r("p"),Ze.innerHTML=vo,Xt=n(),Q=r("div"),c(we.$$.fragment),Ht=n(),Ge=r("p"),Ge.innerHTML=wo,ft=n(),c(Te.$$.fragment),gt=n(),J=r("div"),c(xe.$$.fragment),Vt=n(),qe=r("p"),qe.textContent=To,ht=n(),c(je.$$.fragment),_t=n(),Se=r("p"),this.h()},l(e){const t=Uo("svelte-u9bgzb",document.head);p=d(t,"META",{name:!0,content:!0}),t.forEach(o),T=s(e),M=d(e,"P",{}),D(M).forEach(o),y=s(e),m(w.$$.fragment,e),l=s(e),m(x.$$.fragment,e),Qe=s(e),H=d(e,"P",{"data-svelte-h":!0}),_(H)!=="svelte-g1p02e"&&(H.innerHTML=Ot),Xe=s(e),V=d(e,"P",{"data-svelte-h":!0}),_(V)!=="svelte-117z0zb"&&(V.innerHTML=Yt),He=s(e),z=d(e,"P",{"data-svelte-h":!0}),_(z)!=="svelte-wu27l3"&&(z.textContent=Kt),Ve=s(e),F=d(e,"P",{"data-svelte-h":!0}),_(F)!=="svelte-1ddltfs"&&(F.innerHTML=eo),ze=s(e),R=d(e,"P",{"data-svelte-h":!0}),_(R)!=="svelte-1gmtu0s"&&(R.innerHTML=to),Fe=s(e),m(O.$$.fragment,e),Re=s(e),m(Y.$$.fragment,e),Oe=s(e),K=d(e,"P",{"data-svelte-h":!0}),_(K)!=="svelte-1hmv6lf"&&(K.textContent=oo),Ye=s(e),ee=d(e,"P",{"data-svelte-h":!0}),_(ee)!=="svelte-1hmqwck"&&(ee.textContent=no),Ke=s(e),te=d(e,"TABLE",{"data-svelte-h":!0}),_(te)!=="svelte-bgni2t"&&(te.innerHTML=so),et=s(e),m(oe.$$.fragment,e),tt=s(e),ne=d(e,"UL",{"data-svelte-h":!0}),_(ne)!=="svelte-1ry6k60"&&(ne.innerHTML=ao),ot=s(e),m(se.$$.fragment,e),nt=s(e),ae=d(e,"UL",{"data-svelte-h":!0}),_(ae)!=="svelte-jm9a2k"&&(ae.innerHTML=io),st=s(e),m(ie.$$.fragment,e),at=s(e),re=d(e,"UL",{"data-svelte-h":!0}),_(re)!=="svelte-9fhj5v"&&(re.innerHTML=ro),it=s(e),de=d(e,"P",{"data-svelte-h":!0}),_(de)!=="svelte-16dt629"&&(de.innerHTML=lo),rt=s(e),P=d(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),_(P)!=="svelte-r1jcqf"&&(P.innerHTML=po),dt=s(e),m(le.$$.fragment,e),lt=s(e),b=d(e,"DIV",{class:!0});var v=D(b);m(pe.$$.fragment,v),xt=s(v),ke=d(v,"P",{"data-svelte-h":!0}),_(ke)!=="svelte-m23e7a"&&(ke.textContent=co),jt=s(v),Ae=d(v,"P",{"data-svelte-h":!0}),_(Ae)!=="svelte-1w7teon"&&(Ae.innerHTML=mo),Dt=s(v),A=d(v,"DIV",{class:!0});var $=D(A);m(ce.$$.fragment,$),kt=s($),Ue=d($,"P",{"data-svelte-h":!0}),_(Ue)!=="svelte-50j04k"&&(Ue.textContent=uo),At=s($),m(I.$$.fragment,$),Ut=s($),m(Z.$$.fragment,$),$.forEach(o),Lt=s(v),G=d(v,"DIV",{class:!0});var De=D(G);m(me.$$.fragment,De),$t=s(De),Le=d(De,"P",{"data-svelte-h":!0}),_(Le)!=="svelte-1s3c06i"&&(Le.innerHTML=fo),De.forEach(o),Ct=s(v),q=d(v,"DIV",{class:!0});var bt=D(q);m(ue.$$.fragment,bt),Nt=s(bt),$e=d(bt,"P",{"data-svelte-h":!0}),_($e)!=="svelte-d7vywi"&&($e.innerHTML=go),bt.forEach(o),Jt=s(v),E=d(v,"DIV",{class:!0});var yt=D(E);m(fe.$$.fragment,yt),Bt=s(yt),Ce=d(yt,"P",{"data-svelte-h":!0}),_(Ce)!=="svelte-14bnrb6"&&(Ce.textContent=ho),yt.forEach(o),Pt=s(v),C=d(v,"DIV",{class:!0});var Ee=D(C);m(ge.$$.fragment,Ee),It=s(Ee),Ne=d(Ee,"P",{"data-svelte-h":!0}),_(Ne)!=="svelte-16q0ax1"&&(Ne.textContent=_o),Zt=s(Ee),m(W.$$.fragment,Ee),Ee.forEach(o),Gt=s(v),S=d(v,"DIV",{class:!0});var vt=D(S);m(he.$$.fragment,vt),qt=s(vt),Je=d(vt,"P",{"data-svelte-h":!0}),_(Je)!=="svelte-1whwjf0"&&(Je.textContent=Mo),vt.forEach(o),v.forEach(o),pt=s(e),m(_e.$$.fragment,e),ct=s(e),L=d(e,"DIV",{class:!0});var We=D(L);m(Me.$$.fragment,We),Et=s(We),Be=d(We,"P",{"data-svelte-h":!0}),_(Be)!=="svelte-eyi7ar"&&(Be.innerHTML=bo),Wt=s(We),Pe=d(We,"DIV",{class:!0});var xo=D(Pe);m(be.$$.fragment,xo),xo.forEach(o),We.forEach(o),mt=s(e),m(ye.$$.fragment,e),ut=s(e),k=d(e,"DIV",{class:!0});var X=D(k);m(ve.$$.fragment,X),St=s(X),Ie=d(X,"P",{"data-svelte-h":!0}),_(Ie)!=="svelte-1ktc3ds"&&(Ie.innerHTML=yo),Qt=s(X),Ze=d(X,"P",{"data-svelte-h":!0}),_(Ze)!=="svelte-1o7a6vl"&&(Ze.innerHTML=vo),Xt=s(X),Q=d(X,"DIV",{class:!0});var wt=D(Q);m(we.$$.fragment,wt),Ht=s(wt),Ge=d(wt,"P",{"data-svelte-h":!0}),_(Ge)!=="svelte-166sxja"&&(Ge.innerHTML=wo),wt.forEach(o),X.forEach(o),ft=s(e),m(Te.$$.fragment,e),gt=s(e),J=d(e,"DIV",{class:!0});var Tt=D(J);m(xe.$$.fragment,Tt),Vt=s(Tt),qe=d(Tt,"P",{"data-svelte-h":!0}),_(qe)!=="svelte-19ryw33"&&(qe.textContent=To),Tt.forEach(o),ht=s(e),m(je.$$.fragment,e),_t=s(e),Se=d(e,"P",{}),D(Se).forEach(o),this.h()},h(){j(p,"name","hf:doc:metadata"),j(p,"content",Po),j(P,"class","tip"),j(A,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),j(G,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),j(q,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),j(E,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),j(C,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),j(S,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),j(b,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),j(Pe,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),j(L,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),j(Q,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),j(k,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),j(J,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8")},m(e,t){i(document.head,p),a(e,T,t),a(e,M,t),a(e,y,t),u(w,e,t),a(e,l,t),u(x,e,t),a(e,Qe,t),a(e,H,t),a(e,Xe,t),a(e,V,t),a(e,He,t),a(e,z,t),a(e,Ve,t),a(e,F,t),a(e,ze,t),a(e,R,t),a(e,Fe,t),u(O,e,t),a(e,Re,t),u(Y,e,t),a(e,Oe,t),a(e,K,t),a(e,Ye,t),a(e,ee,t),a(e,Ke,t),a(e,te,t),a(e,et,t),u(oe,e,t),a(e,tt,t),a(e,ne,t),a(e,ot,t),u(se,e,t),a(e,nt,t),a(e,ae,t),a(e,st,t),u(ie,e,t),a(e,at,t),a(e,re,t),a(e,it,t),a(e,de,t),a(e,rt,t),a(e,P,t),a(e,dt,t),u(le,e,t),a(e,lt,t),a(e,b,t),u(pe,b,null),i(b,xt),i(b,ke),i(b,jt),i(b,Ae),i(b,Dt),i(b,A),u(ce,A,null),i(A,kt),i(A,Ue),i(A,At),u(I,A,null),i(A,Ut),u(Z,A,null),i(b,Lt),i(b,G),u(me,G,null),i(G,$t),i(G,Le),i(b,Ct),i(b,q),u(ue,q,null),i(q,Nt),i(q,$e),i(b,Jt),i(b,E),u(fe,E,null),i(E,Bt),i(E,Ce),i(b,Pt),i(b,C),u(ge,C,null),i(C,It),i(C,Ne),i(C,Zt),u(W,C,null),i(b,Gt),i(b,S),u(he,S,null),i(S,qt),i(S,Je),a(e,pt,t),u(_e,e,t),a(e,ct,t),a(e,L,t),u(Me,L,null),i(L,Et),i(L,Be),i(L,Wt),i(L,Pe),u(be,Pe,null),a(e,mt,t),u(ye,e,t),a(e,ut,t),a(e,k,t),u(ve,k,null),i(k,St),i(k,Ie),i(k,Qt),i(k,Ze),i(k,Xt),i(k,Q),u(we,Q,null),i(Q,Ht),i(Q,Ge),a(e,ft,t),u(Te,e,t),a(e,gt,t),a(e,J,t),u(xe,J,null),i(J,Vt),i(J,qe),a(e,ht,t),u(je,e,t),a(e,_t,t),a(e,Se,t),Mt=!0},p(e,[t]){const v={};t&2&&(v.$$scope={dirty:t,ctx:e}),I.$set(v);const $={};t&2&&($.$$scope={dirty:t,ctx:e}),Z.$set($);const De={};t&2&&(De.$$scope={dirty:t,ctx:e}),W.$set(De)},i(e){Mt\|\|(f(w.$$.fragment,e),f(x.$$.fragment,e),f(O.$$.fragment,e),f(Y.$$.fragment,e),f(oe.$$.fragment,e),f(se.$$.fragment,e),f(ie.$$.fragment,e),f(le.$$.fragment,e),f(pe.$$.fragment,e),f(ce.$$.fragment,e),f(I.$$.fragment,e),f(Z.$$.fragment,e),f(me.$$.fragment,e),f(ue.$$.fragment,e),f(fe.$$.fragment,e),f(ge.$$.fragment,e),f(W.$$.fragment,e),f(he.$$.fragment,e),f(_e.$$.fragment,e),f(Me.$$.fragment,e),f(be.$$.fragment,e),f(ye.$$.fragment,e),f(ve.$$.fragment,e),f(we.$$.fragment,e),f(Te.$$.fragment,e),f(xe.$$.fragment,e),f(je.$$.fragment,e),Mt=!0)},o(e){g(w.$$.fragment,e),g(x.$$.fragment,e),g(O.$$.fragment,e),g(Y.$$.fragment,e),g(oe.$$.fragment,e),g(se.$$.fragment,e),g(ie.$$.fragment,e),g(le.$$.fragment,e),g(pe.$$.fragment,e),g(ce.$$.fragment,e),g(I.$$.fragment,e),g(Z.$$.fragment,e),g(me.$$.fragment,e),g(ue.$$.fragment,e),g(fe.$$.fragment,e),g(ge.$$.fragment,e),g(W.$$.fragment,e),g(he.$$.fragment,e),g(_e.$$.fragment,e),g(Me.$$.fragment,e),g(be.$$.fragment,e),g(ye.$$.fragment,e),g(ve.$$.fragment,e),g(we.$$.fragment,e),g(Te.$$.fragment,e),g(xe.$$.fragment,e),g(je.$$.fragment,e),Mt=!1},d(e){e&&(o(T),o(M),o(y),o(l),o(Qe),o(H),o(Xe),o(V),o(He),o(z),o(Ve),o(F),o(ze),o(R),o(Fe),o(Re),o(Oe),o(K),o(Ye),o(ee),o(Ke),o(te),o(et),o(tt),o(ne),o(ot),o(nt),o(ae),o(st),o(at),o(re),o(it),o(de),o(rt),o(P),o(dt),o(lt),o(b),o(pt),o(ct),o(L),o(mt),o(ut),o(k),o(ft),o(gt),o(J),o(ht),o(_t),o(Se)),o(p),h(w,e),h(x,e),h(O,e),h(Y,e),h(oe,e),h(se,e),h(ie,e),h(le,e),h(pe),h(ce),h(I),h(Z),h(me),h(ue),h(fe),h(ge),h(W),h(he),h(_e,e),h(Me),h(be),h(ye,e),h(ve),h(we),h(Te,e),h(xe),h(je,e)}}}const Po='{"title":"AudioLDM 2","local":"audioldm-2","sections":[{"title":"Tips","local":"tips","sections":[{"title":"Choosing a checkpoint","local":"choosing-a-checkpoint","sections":[],"depth":3},{"title":"Constructing a prompt","local":"constructing-a-prompt","sections":[],"depth":3},{"title":"Controlling inference","local":"controlling-inference","sections":[],"depth":3},{"title":"Evaluating generated waveforms:","local":"evaluating-generated-waveforms","sections":[],"depth":3}],"depth":2},{"title":"AudioLDM2Pipeline","local":"diffusers.AudioLDM2Pipeline","sections":[],"depth":2},{"title":"AudioLDM2ProjectionModel","local":"diffusers.AudioLDM2ProjectionModel","sections":[],"depth":2},{"title":"AudioLDM2UNet2DConditionModel","local":"diffusers.AudioLDM2UNet2DConditionModel","sections":[],"depth":2},{"title":"AudioPipelineOutput","local":"diffusers.AudioPipelineOutput","sections":[],"depth":2}],"depth":1}';function Io(B){return Do(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Xo extends ko{constructor(p){super(),Ao(this,p,Io,Bo,jo,{})}}export{Xo as component};

Xet Storage Details

Size:: 82.1 kB
Xet hash:: 78b77ef1149a3bf6ca420e57dcdb4ee3aa2293d8af9202e11ea3127bbd465a90

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.