Buckets:

hf-doc-build
/

doc

hf-doc-build/doc/diffusers/v0.22.1/en/_app/immutable/nodes/22.a3fbb9d3.js

rtrm's picture

about 2 months ago

12.4 kB

	/**
	 * Compiled (auto-generated) Svelte page module for the `VQModel` API
	 * reference page of the diffusers v0.22.1 documentation.
	 *
	 * The identifiers are the minifier's short names and are kept as-is. The
	 * helpers imported from the shared chunks are opaque from this file; the
	 * notes below are inferred from their call sites only (TODO confirm against
	 * the chunk sources):
	 *   d / s        - create an element / a whitespace node
	 *   i / r / A    - claim an element / claim whitespace / list child nodes
	 *   T / k / E / L - create / claim / mount / destroy a child component
	 *   D / O        - run intro / outro of a child component fragment
	 *   P            - set an attribute
	 *   l / n        - append to a parent / insert before an anchor
	 *   t            - detach a node
	 *   Q            - read the `data-svelte-h` content hash of an element
	 */
	import { s as me, n as pe, o as fe } from "../chunks/scheduler.182ea377.js";
	import {
		S as he,
		i as ge,
		g as d,
		s,
		r as T,
		A as _e,
		h as i,
		f as t,
		c as r,
		j as A,
		u as k,
		x as Q,
		k as P,
		y as l,
		a as n,
		v as E,
		d as D,
		t as O,
		w as L,
	} from "../chunks/index.abf12888.js";
	import { D as oe } from "../chunks/Docstring.93f6f462.js";
	import { H as ne } from "../chunks/Heading.16916d63.js";

	/**
	 * Fragment factory for the page.
	 *
	 * Builds the heading and docstring child components and returns the
	 * fragment object with its lifecycle methods: `c` (create), `l` (claim
	 * existing DOM), `h` (set attributes), `m` (mount), `p` (update, a no-op
	 * here), `i` / `o` (transition in / out) and `d` (destroy).
	 *
	 * @param {*} se - Unused by this fragment.
	 * @returns {object} The fragment object described above.
	 */
	function ve(se) {
		// DOM nodes and child component instances, assigned in c() / l().
		let c, H, C, I, f, N, h, F, g, S, _, U, v, B, a, b, X, $, Y, x, Z, m, V,
			ee, M, K, w, R, u, y, te, q, j, z;
		// True while the child components are transitioned in.
		let W;

		// Static page content. The hash literals compared in l() are tied to
		// these exact strings, so the text must not be edited independently.
		const re = 'The VQ-VAE model was introduced in <a href="https://huggingface.co/papers/1711.00937" rel="nofollow">Neural Discrete Representation Learning</a> by Aaron van den Oord, Oriol Vinyals and Koray Kavukcuoglu. The model is used in 🤗 Diffusers to decode latent representations into images. Unlike <a href="/docs/diffusers/v0.22.1/en/api/models/autoencoderkl#diffusers.AutoencoderKL">AutoencoderKL</a>, the <a href="/docs/diffusers/v0.22.1/en/api/models/vq#diffusers.VQModel">VQModel</a> works in a quantized latent space.';
		const ae = "The abstract from the paper is:";
		const de = "<em>Learning useful representations without supervision remains a key challenge in machine learning. In this paper, we propose a simple yet powerful generative model that learns such discrete representations. Our model, the Vector Quantised-Variational AutoEncoder (VQ-VAE), differs from VAEs in two key ways: the encoder network outputs discrete, rather than continuous, codes; and the prior is learnt rather than static. In order to learn a discrete latent representation, we incorporate ideas from vector quantisation (VQ). Using the VQ method allows the model to circumvent issues of “posterior collapse” — where the latents are ignored when they are paired with a powerful autoregressive decoder — typically observed in the VAE framework. Pairing these representations with an autoregressive prior, the model can generate high quality images, videos, and speech as well as doing high quality speaker conversion and unsupervised learning of phonemes, providing further evidence of the utility of the learnt representations.</em>";
		const ie = "A VQ-VAE model for decoding latent representations.";
		const le = `This model inherits from <a href="/docs/diffusers/v0.22.1/en/api/models/overview#diffusers.ModelMixin">ModelMixin</a>. Check the superclass documentation for it’s generic methods implemented
	for all models (such as downloading or saving).`;
		const ce = 'The <a href="/docs/diffusers/v0.22.1/en/api/models/vq#diffusers.VQModel">VQModel</a> forward method.';
		const ue = "Output of VQModel encoding method.";

		// Page title (h1).
		f = new ne({
			props: { title: "VQModel", local: "vqmodel", headingTag: "h1" },
		});

		// Section heading for the VQModel class (h2).
		v = new ne({
			props: { title: "VQModel", local: "diffusers.VQModel", headingTag: "h2" },
		});

		// Docstring for `class diffusers.VQModel`.
		b = new oe({
			props: {
				name: "class diffusers.VQModel",
				anchor: "diffusers.VQModel",
				parameters: [
					{ name: "in_channels", val: ": int = 3" },
					{ name: "out_channels", val: ": int = 3" },
					{ name: "down_block_types", val: ": typing.Tuple[str, ...] = ('DownEncoderBlock2D',)" },
					{ name: "up_block_types", val: ": typing.Tuple[str, ...] = ('UpDecoderBlock2D',)" },
					{ name: "block_out_channels", val: ": typing.Tuple[int, ...] = (64,)" },
					{ name: "layers_per_block", val: ": int = 1" },
					{ name: "act_fn", val: ": str = 'silu'" },
					{ name: "latent_channels", val: ": int = 3" },
					{ name: "sample_size", val: ": int = 32" },
					{ name: "num_vq_embeddings", val: ": int = 256" },
					{ name: "norm_num_groups", val: ": int = 32" },
					{ name: "vq_embed_dim", val: ": typing.Optional[int] = None" },
					{ name: "scaling_factor", val: ": float = 0.18215" },
					{ name: "norm_type", val: ": str = 'group'" },
				],
				parametersDescription: [
					{
						anchor: "diffusers.VQModel.in_channels",
						description: "<strong>in_channels</strong> (int, <em>optional</em>, defaults to 3) — Number of channels in the input image.",
						name: "in_channels",
					},
					{
						anchor: "diffusers.VQModel.out_channels",
						description: "<strong>out_channels</strong> (int, <em>optional</em>, defaults to 3) — Number of channels in the output.",
						name: "out_channels",
					},
					{
						anchor: "diffusers.VQModel.down_block_types",
						description: `<strong>down_block_types</strong> (<code>Tuple[str]</code>, <em>optional</em>, defaults to <code>("DownEncoderBlock2D",)</code>) —
	Tuple of downsample block types.`,
						name: "down_block_types",
					},
					{
						anchor: "diffusers.VQModel.up_block_types",
						description: `<strong>up_block_types</strong> (<code>Tuple[str]</code>, <em>optional</em>, defaults to <code>("UpDecoderBlock2D",)</code>) —
	Tuple of upsample block types.`,
						name: "up_block_types",
					},
					{
						anchor: "diffusers.VQModel.block_out_channels",
						description: `<strong>block_out_channels</strong> (<code>Tuple[int]</code>, <em>optional</em>, defaults to <code>(64,)</code>) —
	Tuple of block output channels.`,
						name: "block_out_channels",
					},
					{
						anchor: "diffusers.VQModel.layers_per_block",
						description: "<strong>layers_per_block</strong> (<code>int</code>, <em>optional</em>, defaults to <code>1</code>) — Number of layers per block.",
						name: "layers_per_block",
					},
					{
						anchor: "diffusers.VQModel.act_fn",
						// Single-quoted because the text itself contains double quotes.
						description: '<strong>act_fn</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"silu"</code>) — The activation function to use.',
						name: "act_fn",
					},
					{
						anchor: "diffusers.VQModel.latent_channels",
						description: "<strong>latent_channels</strong> (<code>int</code>, <em>optional</em>, defaults to <code>3</code>) — Number of channels in the latent space.",
						name: "latent_channels",
					},
					{
						anchor: "diffusers.VQModel.sample_size",
						description: "<strong>sample_size</strong> (<code>int</code>, <em>optional</em>, defaults to <code>32</code>) — Sample input size.",
						name: "sample_size",
					},
					{
						anchor: "diffusers.VQModel.num_vq_embeddings",
						description: "<strong>num_vq_embeddings</strong> (<code>int</code>, <em>optional</em>, defaults to <code>256</code>) — Number of codebook vectors in the VQ-VAE.",
						name: "num_vq_embeddings",
					},
					{
						anchor: "diffusers.VQModel.norm_num_groups",
						description: "<strong>norm_num_groups</strong> (<code>int</code>, <em>optional</em>, defaults to <code>32</code>) — Number of groups for normalization layers.",
						name: "norm_num_groups",
					},
					{
						anchor: "diffusers.VQModel.vq_embed_dim",
						description: "<strong>vq_embed_dim</strong> (<code>int</code>, <em>optional</em>) — Hidden dim of codebook vectors in the VQ-VAE.",
						name: "vq_embed_dim",
					},
					{
						anchor: "diffusers.VQModel.scaling_factor",
						description: `<strong>scaling_factor</strong> (<code>float</code>, <em>optional</em>, defaults to <code>0.18215</code>) —
	The component-wise standard deviation of the trained latent space computed using the first batch of the
	training set. This is used to scale the latent space to have unit variance when training the diffusion
	model. The latents are scaled with the formula <code>z = z * scaling_factor</code> before being passed to the
	diffusion model. When decoding, the latents are scaled back to the original scale with the formula: <code>z = 1 / scaling_factor * z</code>. For more details, refer to sections 4.3.2 and D.1 of the <a href="https://arxiv.org/abs/2112.10752" rel="nofollow">High-Resolution Image
	Synthesis with Latent Diffusion Models</a> paper.`,
						name: "scaling_factor",
					},
					{
						anchor: "diffusers.VQModel.norm_type",
						description: `<strong>norm_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"group"</code>) —
	Type of normalization layer to use. Can be one of <code>"group"</code> or <code>"spatial"</code>.`,
						name: "norm_type",
					},
				],
				source: "https://github.com/huggingface/diffusers/blob/v0.22.1/src/diffusers/models/vq_model.py#L40",
			},
		});

		// Docstring for `VQModel.forward`.
		V = new oe({
			props: {
				name: "forward",
				anchor: "diffusers.VQModel.forward",
				parameters: [
					{ name: "sample", val: ": FloatTensor" },
					{ name: "return_dict", val: ": bool = True" },
				],
				parametersDescription: [
					{
						anchor: "diffusers.VQModel.forward.sample",
						description: "<strong>sample</strong> (<code>torch.FloatTensor</code>) — Input sample.",
						name: "sample",
					},
					{
						anchor: "diffusers.VQModel.forward.return_dict",
						description: `<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) —
	Whether or not to return a <a href="/docs/diffusers/v0.22.1/en/api/models/vq#diffusers.models.vq_model.VQEncoderOutput">models.vq_model.VQEncoderOutput</a> instead of a plain tuple.`,
						name: "return_dict",
					},
				],
				source: "https://github.com/huggingface/diffusers/blob/v0.22.1/src/diffusers/models/vq_model.py#L151",
				returnDescription: `<script context="module">export const metadata = 'undefined';<\/script>


	<p>If return_dict is True, a <a
	href="/docs/diffusers/v0.22.1/en/api/models/vq#diffusers.models.vq_model.VQEncoderOutput"
	>VQEncoderOutput</a> is returned, otherwise a plain <code>tuple</code>
	is returned.</p>
	`,
				returnType: `<script context="module">export const metadata = 'undefined';<\/script>


	<p><a
	href="/docs/diffusers/v0.22.1/en/api/models/vq#diffusers.models.vq_model.VQEncoderOutput"
	>VQEncoderOutput</a> or <code>tuple</code></p>
	`,
			},
		});

		// Section heading for VQEncoderOutput (h2).
		w = new ne({
			props: {
				title: "VQEncoderOutput",
				local: "diffusers.models.vq_model.VQEncoderOutput",
				headingTag: "h2",
			},
		});

		// Docstring for `class diffusers.models.vq_model.VQEncoderOutput`.
		y = new oe({
			props: {
				name: "class diffusers.models.vq_model.VQEncoderOutput",
				anchor: "diffusers.models.vq_model.VQEncoderOutput",
				parameters: [{ name: "latents", val: ": FloatTensor" }],
				parametersDescription: [
					{
						anchor: "diffusers.models.vq_model.VQEncoderOutput.latents",
						description: `<strong>latents</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, num_channels, height, width)</code>) —
	The encoded output sample from the last layer of the model.`,
						name: "latents",
					},
				],
				source: "https://github.com/huggingface/diffusers/blob/v0.22.1/src/diffusers/models/vq_model.py#L28",
			},
		});

		return {
			// Create: build every node from scratch.
			c() {
				c = d("meta");
				H = s();
				C = d("p");
				I = s();
				T(f.$$.fragment);
				N = s();
				h = d("p");
				h.innerHTML = re;
				F = s();
				g = d("p");
				g.textContent = ae;
				S = s();
				_ = d("p");
				_.innerHTML = de;
				U = s();
				T(v.$$.fragment);
				B = s();
				a = d("div");
				T(b.$$.fragment);
				X = s();
				$ = d("p");
				$.textContent = ie;
				Y = s();
				x = d("p");
				x.innerHTML = le;
				Z = s();
				m = d("div");
				T(V.$$.fragment);
				ee = s();
				M = d("p");
				M.innerHTML = ce;
				K = s();
				T(w.$$.fragment);
				R = s();
				u = d("div");
				T(y.$$.fragment);
				te = s();
				q = d("p");
				q.textContent = ue;
				j = s();
				z = d("p");
				this.h();
			},

			// Claim: adopt nodes that already exist under `e`. Static content is
			// rewritten only when the element's stored hash differs from the
			// expected one.
			l(e) {
				const o = _e("svelte-u9bgzb", document.head);
				c = i(o, "META", { name: true, content: true });
				o.forEach(t);
				H = r(e);
				C = i(e, "P", {});
				A(C).forEach(t);
				I = r(e);
				k(f.$$.fragment, e);
				N = r(e);
				h = i(e, "P", { "data-svelte-h": true });
				if (Q(h) !== "svelte-1gx8i7b") {
					h.innerHTML = re;
				}
				F = r(e);
				g = i(e, "P", { "data-svelte-h": true });
				if (Q(g) !== "svelte-1cwsb16") {
					g.textContent = ae;
				}
				S = r(e);
				_ = i(e, "P", { "data-svelte-h": true });
				if (Q(_) !== "svelte-331jgs") {
					_.innerHTML = de;
				}
				U = r(e);
				k(v.$$.fragment, e);
				B = r(e);
				a = i(e, "DIV", { class: true });
				const p = A(a);
				k(b.$$.fragment, p);
				X = r(p);
				$ = i(p, "P", { "data-svelte-h": true });
				if (Q($) !== "svelte-1fe1n0y") {
					$.textContent = ie;
				}
				Y = r(p);
				x = i(p, "P", { "data-svelte-h": true });
				if (Q(x) !== "svelte-66wfcc") {
					x.innerHTML = le;
				}
				Z = r(p);
				m = i(p, "DIV", { class: true });
				const G = A(m);
				k(V.$$.fragment, G);
				ee = r(G);
				M = i(G, "P", { "data-svelte-h": true });
				if (Q(M) !== "svelte-pef85g") {
					M.innerHTML = ce;
				}
				G.forEach(t);
				p.forEach(t);
				K = r(e);
				k(w.$$.fragment, e);
				R = r(e);
				u = i(e, "DIV", { class: true });
				const J = A(u);
				k(y.$$.fragment, J);
				te = r(J);
				q = i(J, "P", { "data-svelte-h": true });
				if (Q(q) !== "svelte-1u2gsdw") {
					q.textContent = ue;
				}
				J.forEach(t);
				j = r(e);
				z = i(e, "P", {});
				A(z).forEach(t);
				this.h();
			},

			// Attributes shared by the create and claim paths.
			h() {
				P(c, "name", "hf:doc:metadata");
				P(c, "content", be);
				P(m, "class", "docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8");
				P(a, "class", "docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8");
				P(u, "class", "docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8");
			},

			// Mount: the meta tag goes into <head>, everything else into `e`
			// before anchor `o`.
			m(e, o) {
				l(document.head, c);
				n(e, H, o);
				n(e, C, o);
				n(e, I, o);
				E(f, e, o);
				n(e, N, o);
				n(e, h, o);
				n(e, F, o);
				n(e, g, o);
				n(e, S, o);
				n(e, _, o);
				n(e, U, o);
				E(v, e, o);
				n(e, B, o);
				n(e, a, o);
				E(b, a, null);
				l(a, X);
				l(a, $);
				l(a, Y);
				l(a, x);
				l(a, Z);
				l(a, m);
				E(V, m, null);
				l(m, ee);
				l(m, M);
				n(e, K, o);
				E(w, e, o);
				n(e, R, o);
				n(e, u, o);
				E(y, u, null);
				l(u, te);
				l(u, q);
				n(e, j, o);
				n(e, z, o);
				W = true;
			},

			// Update: nothing on this page is reactive.
			p: pe,

			// Transition in, guarded so repeated calls do no extra work.
			i(e) {
				if (W) {
					return;
				}
				D(f.$$.fragment, e);
				D(v.$$.fragment, e);
				D(b.$$.fragment, e);
				D(V.$$.fragment, e);
				D(w.$$.fragment, e);
				D(y.$$.fragment, e);
				W = true;
			},

			// Transition out.
			o(e) {
				O(f.$$.fragment, e);
				O(v.$$.fragment, e);
				O(b.$$.fragment, e);
				O(V.$$.fragment, e);
				O(w.$$.fragment, e);
				O(y.$$.fragment, e);
				W = false;
			},

			// Destroy: top-level nodes are detached only when `e` is truthy; the
			// head meta tag and the child components are always torn down.
			d(e) {
				if (e) {
					t(H);
					t(C);
					t(I);
					t(N);
					t(h);
					t(F);
					t(g);
					t(S);
					t(_);
					t(U);
					t(B);
					t(a);
					t(K);
					t(R);
					t(u);
					t(j);
					t(z);
				}
				t(c);
				L(f, e);
				L(v, e);
				L(b);
				L(V);
				L(w, e);
				L(y);
			},
		};
	}

	// JSON table-of-contents metadata written to the `hf:doc:metadata` meta tag.
	const be = '{"title":"VQModel","local":"vqmodel","sections":[{"title":"VQModel","local":"diffusers.VQModel","sections":[],"depth":2},{"title":"VQEncoderOutput","local":"diffusers.models.vq_model.VQEncoderOutput","sections":[],"depth":2}],"depth":1}';

	/**
	 * Instance function for the component.
	 *
	 * Registers a callback through `fe` that reads the `fw` query parameter
	 * and discards the value, so it has no observable effect in this module.
	 *
	 * @param {*} se - Unused.
	 * @returns {Array} An empty array (the component keeps no instance state).
	 */
	function Ve(se) {
		fe(() => {
			new URLSearchParams(window.location.search).get("fw");
		});
		return [];
	}

	/** Page component, exported under the name `component`. */
	class xe extends he {
		constructor(c) {
			super();
			ge(this, c, Ve, ve, me, {});
		}
	}

	export { xe as component };

Xet Storage Details

Size: 12.4 kB
Xet hash: 029b15bd0cfb238149c489571ba9b111d50bd2c6bfaad144b6609f2a37c4d035

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.