Buckets:

hf-doc-build
/

doc

hf-doc-build/doc / diffusers /v0.23.0 /en /_app /immutable /nodes /16.591fc8ff.js

rtrm's picture

about 2 months ago

16.1 kB

	/**
	 * Compiled Svelte page chunk for the diffusers v0.23.0 "Transformer2D" API page.
	 *
	 * The module exports a single component class (`component`). All page content
	 * is embedded as HTML strings and as props for the Heading / Docstring / Tip
	 * chunk components.
	 *
	 * NOTE(review): the single-letter imports below look like minified aliases of
	 * the Svelte runtime helpers (element creation, hydration "claim", insert,
	 * detach, component mount/destroy, transitions). The roles noted in comments
	 * are inferred from how each alias is called here; confirm against
	 * chunks/index.* and chunks/scheduler.* before relying on them.
	 */
	import { s as ge, o as Te, n as be } from "../chunks/scheduler.182ea377.js";
	import {
		S as ve,
		i as $e,
		g as i,
		s as r,
		r as O,
		A as De,
		h as d,
		f as n,
		c as a,
		j as V,
		u as C,
		x as u,
		k as j,
		y as p,
		a as o,
		v as N,
		d as P,
		t as H,
		w as A,
	} from "../chunks/index.abf12888.js";
	import { T as Me } from "../chunks/Tip.230e2334.js";
	import { D as ie } from "../chunks/Docstring.93f6f462.js";
	import { H as de } from "../chunks/Heading.16916d63.js";

	/**
	 * Fragment for the default slot of the Tip component: a single <p> holding
	 * the note about the masked latent pixel.
	 *
	 * @param {*} U - Context argument; not read by this fragment.
	 * @returns {{c: Function, l: Function, m: Function, p: Function, d: Function}}
	 *   Fragment object with create (c), claim (l), mount (m), update (p) and
	 *   destroy (d) hooks.
	 */
	function ye(U) {
		let s;
		const g = "It is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised image don’t contain a prediction for the masked pixel because the unnoised image cannot be masked.";
		return {
			c() {
				s = i("p");
				s.textContent = g;
			},
			l(l) {
				s = d(l, "P", { "data-svelte-h": true });
				// Only rewrite the text when the claimed node's hash differs from the expected one.
				if (u(s) !== "svelte-3w1alv") {
					s.textContent = g;
				}
			},
			m(l, z) {
				o(l, s, z);
			},
			p: be,
			d(l) {
				if (l) {
					n(s);
				}
			},
		};
	}

	/**
	 * Root fragment of the page: head metadata, intro paragraphs, the Tip, and
	 * three documented sections (Transformer2DModel, its forward method, and
	 * Transformer2DModelOutput).
	 *
	 * The hooks are written as shorthand methods on purpose: inside `l(e)`, `c()`,
	 * `m(e, t)` etc. the identifiers `l`, `c`, `m`, `h` still refer to the `let`
	 * bindings declared below, not to the hooks themselves.
	 *
	 * @param {*} U - Component context; forwarded to the Tip as `$$scope.ctx`.
	 * @returns {object} Fragment object with c/l/h/m/p/i/o/d hooks.
	 */
	function xe(U) {
		// DOM nodes and whitespace anchors, assigned in c() or l().
		let s, g, l, z, S, b, W, v, B, $, R, D, Z, G, M, J, K, m, oe, I, se, _, re, q, Q, X, c, ae, F, Y, E;
		// Tracks whether intro transitions have already been started; see i() / o().
		let ee;

		// Static HTML / text snippets inserted into the nodes above.
		const le = 'A Transformer model for image-like data from <a href="https://huggingface.co/CompVis" rel="nofollow">CompVis</a> that is based on the <a href="https://huggingface.co/papers/2010.11929" rel="nofollow">Vision Transformer</a> introduced by Dosovitskiy et al. The <a href="/docs/diffusers/v0.23.0/en/api/models/transformer2d#diffusers.Transformer2DModel">Transformer2DModel</a> accepts discrete (classes of vector embeddings) or continuous (actual embeddings) inputs.';
		const me = "When the input is <strong>continuous</strong>:";
		const ce = "<li>Project the input and reshape it to <code>(batch_size, sequence_length, feature_dimension)</code>.</li> <li>Apply the Transformer blocks in the standard way.</li> <li>Reshape to image.</li>";
		const fe = "When the input is <strong>discrete</strong>:";
		const pe = "<li>Convert input (classes of latent pixels) to embeddings and apply positional embeddings.</li> <li>Apply the Transformer blocks in the standard way.</li> <li>Predict classes of unnoised image.</li>";
		const ue = "A 2D Transformer model for image-like data.";
		const he = 'The <a href="/docs/diffusers/v0.23.0/en/api/models/transformer2d#diffusers.Transformer2DModel">Transformer2DModel</a> forward method.';
		const _e = 'The output of <a href="/docs/diffusers/v0.23.0/en/api/models/transformer2d#diffusers.Transformer2DModel">Transformer2DModel</a>.';

		// Child components, constructed in document order: T, h, y, x, w, k, L.
		const T = new de({
			props: { title: "Transformer2D", local: "transformer2d", headingTag: "h1" },
		});

		const h = new Me({
			props: { $$slots: { default: [ye] }, $$scope: { ctx: U } },
		});

		const y = new de({
			props: { title: "Transformer2DModel", local: "diffusers.Transformer2DModel", headingTag: "h2" },
		});

		const x = new ie({
			props: {
				name: "class diffusers.Transformer2DModel",
				anchor: "diffusers.Transformer2DModel",
				parameters: [
					{ name: "num_attention_heads", val: ": int = 16" },
					{ name: "attention_head_dim", val: ": int = 88" },
					{ name: "in_channels", val: ": typing.Optional[int] = None" },
					{ name: "out_channels", val: ": typing.Optional[int] = None" },
					{ name: "num_layers", val: ": int = 1" },
					{ name: "dropout", val: ": float = 0.0" },
					{ name: "norm_num_groups", val: ": int = 32" },
					{ name: "cross_attention_dim", val: ": typing.Optional[int] = None" },
					{ name: "attention_bias", val: ": bool = False" },
					{ name: "sample_size", val: ": typing.Optional[int] = None" },
					{ name: "num_vector_embeds", val: ": typing.Optional[int] = None" },
					{ name: "patch_size", val: ": typing.Optional[int] = None" },
					{ name: "activation_fn", val: ": str = 'geglu'" },
					{ name: "num_embeds_ada_norm", val: ": typing.Optional[int] = None" },
					{ name: "use_linear_projection", val: ": bool = False" },
					{ name: "only_cross_attention", val: ": bool = False" },
					{ name: "double_self_attention", val: ": bool = False" },
					{ name: "upcast_attention", val: ": bool = False" },
					{ name: "norm_type", val: ": str = 'layer_norm'" },
					{ name: "norm_elementwise_affine", val: ": bool = True" },
					{ name: "norm_eps", val: ": float = 1e-05" },
					{ name: "attention_type", val: ": str = 'default'" },
					{ name: "caption_channels", val: ": int = None" },
				],
				parametersDescription: [
					{
						anchor: "diffusers.Transformer2DModel.num_attention_heads",
						description: "<strong>num_attention_heads</strong> (<code>int</code>, <em>optional</em>, defaults to 16) — The number of heads to use for multi-head attention.",
						name: "num_attention_heads",
					},
					{
						anchor: "diffusers.Transformer2DModel.attention_head_dim",
						description: "<strong>attention_head_dim</strong> (<code>int</code>, <em>optional</em>, defaults to 88) — The number of channels in each head.",
						name: "attention_head_dim",
					},
					{
						anchor: "diffusers.Transformer2DModel.in_channels",
						description: `<strong>in_channels</strong> (<code>int</code>, <em>optional</em>) —
	The number of channels in the input and output (specify if the input is <strong>continuous</strong>).`,
						name: "in_channels",
					},
					{
						anchor: "diffusers.Transformer2DModel.num_layers",
						description: "<strong>num_layers</strong> (<code>int</code>, <em>optional</em>, defaults to 1) — The number of layers of Transformer blocks to use.",
						name: "num_layers",
					},
					{
						anchor: "diffusers.Transformer2DModel.dropout",
						description: "<strong>dropout</strong> (<code>float</code>, <em>optional</em>, defaults to 0.0) — The dropout probability to use.",
						name: "dropout",
					},
					{
						anchor: "diffusers.Transformer2DModel.cross_attention_dim",
						description: "<strong>cross_attention_dim</strong> (<code>int</code>, <em>optional</em>) — The number of <code>encoder_hidden_states</code> dimensions to use.",
						name: "cross_attention_dim",
					},
					{
						anchor: "diffusers.Transformer2DModel.sample_size",
						description: `<strong>sample_size</strong> (<code>int</code>, <em>optional</em>) — The width of the latent images (specify if the input is <strong>discrete</strong>).
	This is fixed during training since it is used to learn a number of position embeddings.`,
						name: "sample_size",
					},
					{
						anchor: "diffusers.Transformer2DModel.num_vector_embeds",
						description: `<strong>num_vector_embeds</strong> (<code>int</code>, <em>optional</em>) —
	The number of classes of the vector embeddings of the latent pixels (specify if the input is <strong>discrete</strong>).
	Includes the class for the masked latent pixel.`,
						name: "num_vector_embeds",
					},
					{
						anchor: "diffusers.Transformer2DModel.activation_fn",
						// Single-quoted because the HTML contains literal double quotes.
						description: '<strong>activation_fn</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"geglu"</code>) — Activation function to use in feed-forward.',
						name: "activation_fn",
					},
					{
						anchor: "diffusers.Transformer2DModel.num_embeds_ada_norm",
						description: `<strong>num_embeds_ada_norm</strong> ( <code>int</code>, <em>optional</em>) —
	The number of diffusion steps used during training. Pass if at least one of the norm_layers is
	<code>AdaLayerNorm</code>. This is fixed during training since it is used to learn a number of embeddings that are
	added to the hidden states.</p>
	<p>During inference, you can denoise for up to but not more steps than <code>num_embeds_ada_norm</code>.`,
						name: "num_embeds_ada_norm",
					},
					{
						anchor: "diffusers.Transformer2DModel.attention_bias",
						description: `<strong>attention_bias</strong> (<code>bool</code>, <em>optional</em>) —
	Configure if the <code>TransformerBlocks</code> attention should contain a bias parameter.`,
						name: "attention_bias",
					},
				],
				source: "https://github.com/huggingface/diffusers/blob/v0.23.0/src/diffusers/models/transformer_2d.py#L45",
			},
		});

		const w = new ie({
			props: {
				name: "forward",
				anchor: "diffusers.Transformer2DModel.forward",
				parameters: [
					{ name: "hidden_states", val: ": Tensor" },
					{ name: "encoder_hidden_states", val: ": typing.Optional[torch.Tensor] = None" },
					{ name: "timestep", val: ": typing.Optional[torch.LongTensor] = None" },
					{ name: "added_cond_kwargs", val: ": typing.Dict[str, torch.Tensor] = None" },
					{ name: "class_labels", val: ": typing.Optional[torch.LongTensor] = None" },
					{ name: "cross_attention_kwargs", val: ": typing.Dict[str, typing.Any] = None" },
					{ name: "attention_mask", val: ": typing.Optional[torch.Tensor] = None" },
					{ name: "encoder_attention_mask", val: ": typing.Optional[torch.Tensor] = None" },
					{ name: "return_dict", val: ": bool = True" },
				],
				parametersDescription: [
					{
						anchor: "diffusers.Transformer2DModel.forward.hidden_states",
						description: `<strong>hidden_states</strong> (<code>torch.LongTensor</code> of shape <code>(batch size, num latent pixels)</code> if discrete, <code>torch.FloatTensor</code> of shape <code>(batch size, channel, height, width)</code> if continuous) —
	Input <code>hidden_states</code>.`,
						name: "hidden_states",
					},
					{
						anchor: "diffusers.Transformer2DModel.forward.encoder_hidden_states",
						description: `<strong>encoder_hidden_states</strong> ( <code>torch.FloatTensor</code> of shape <code>(batch size, sequence len, embed dims)</code>, <em>optional</em>) —
	Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
	self-attention.`,
						name: "encoder_hidden_states",
					},
					{
						anchor: "diffusers.Transformer2DModel.forward.timestep",
						description: `<strong>timestep</strong> ( <code>torch.LongTensor</code>, <em>optional</em>) —
	Used to indicate denoising step. Optional timestep to be applied as an embedding in <code>AdaLayerNorm</code>.`,
						name: "timestep",
					},
					{
						anchor: "diffusers.Transformer2DModel.forward.class_labels",
						description: `<strong>class_labels</strong> ( <code>torch.LongTensor</code> of shape <code>(batch size, num classes)</code>, <em>optional</em>) —
	Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
	<code>AdaLayerZeroNorm</code>.`,
						name: "class_labels",
					},
					{
						anchor: "diffusers.Transformer2DModel.forward.cross_attention_kwargs",
						description: `<strong>cross_attention_kwargs</strong> ( <code>Dict[str, Any]</code>, <em>optional</em>) —
	A kwargs dictionary that if specified is passed along to the <code>AttentionProcessor</code> as defined under
	<code>self.processor</code> in
	<a href="https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py" rel="nofollow">diffusers.models.attention_processor</a>.`,
						name: "cross_attention_kwargs",
					},
					{
						anchor: "diffusers.Transformer2DModel.forward.attention_mask",
						description: `<strong>attention_mask</strong> ( <code>torch.Tensor</code>, <em>optional</em>) —
	An attention mask of shape <code>(batch, key_tokens)</code> is applied to <code>encoder_hidden_states</code>. If <code>1</code> the mask
	is kept, otherwise if <code>0</code> it is discarded. Mask will be converted into a bias, which adds large
	negative values to the attention scores corresponding to “discard” tokens.`,
						name: "attention_mask",
					},
					{
						anchor: "diffusers.Transformer2DModel.forward.encoder_attention_mask",
						description: `<strong>encoder_attention_mask</strong> ( <code>torch.Tensor</code>, <em>optional</em>) —
	Cross-attention mask applied to <code>encoder_hidden_states</code>. Two formats supported:</p>
	<ul>
	<li>Mask <code>(batch, sequence_length)</code> True = keep, False = discard.</li>
	<li>Bias <code>(batch, 1, sequence_length)</code> 0 = keep, -10000 = discard.</li>
	</ul>
	<p>If <code>ndim == 2</code>: will be interpreted as a mask, then converted into a bias consistent with the format
	above. This bias will be added to the cross-attention scores.`,
						name: "encoder_attention_mask",
					},
					{
						// NOTE(review): this text links to UNet2DConditionOutput while
						// returnDescription below names Transformer2DModelOutput. Left as
						// is because the string presumably mirrors the upstream docstring.
						anchor: "diffusers.Transformer2DModel.forward.return_dict",
						description: `<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) —
	Whether or not to return a <a href="/docs/diffusers/v0.23.0/en/api/models/unet2d-cond#diffusers.models.unet_2d_condition.UNet2DConditionOutput">UNet2DConditionOutput</a> instead of a plain
	tuple.`,
						name: "return_dict",
					},
				],
				source: "https://github.com/huggingface/diffusers/blob/v0.23.0/src/diffusers/models/transformer_2d.py#L240",
				returnDescription: `<script context="module">export const metadata = 'undefined';<\/script>


	<p>If <code>return_dict</code> is True, an <a
	href="/docs/diffusers/v0.23.0/en/api/models/transformer2d#diffusers.models.transformer_2d.Transformer2DModelOutput"
	>Transformer2DModelOutput</a> is returned, otherwise a
	<code>tuple</code> where the first element is the sample tensor.</p>
	`,
			},
		});

		const k = new de({
			props: {
				title: "Transformer2DModelOutput",
				local: "diffusers.models.transformer_2d.Transformer2DModelOutput",
				headingTag: "h2",
			},
		});

		const L = new ie({
			props: {
				name: "class diffusers.models.transformer_2d.Transformer2DModelOutput",
				anchor: "diffusers.models.transformer_2d.Transformer2DModelOutput",
				parameters: [{ name: "sample", val: ": FloatTensor" }],
				parametersDescription: [
					{
						anchor: "diffusers.models.transformer_2d.Transformer2DModelOutput.sample",
						description: `<strong>sample</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, num_channels, height, width)</code> or <code>(batch size, num_vector_embeds - 1, num_latent_pixels)</code> if <a href="/docs/diffusers/v0.23.0/en/api/models/transformer2d#diffusers.Transformer2DModel">Transformer2DModel</a> is discrete) —
	The hidden states output conditioned on the <code>encoder_hidden_states</code> input. If discrete, returns probability
	distributions for the unnoised latent pixels.`,
						name: "sample",
					},
				],
				source: "https://github.com/huggingface/diffusers/blob/v0.23.0/src/diffusers/models/transformer_2d.py#L32",
			},
		});

		return {
			// Create: build every node and child fragment from scratch.
			c() {
				s = i("meta");
				g = r();
				l = i("p");
				z = r();
				O(T.$$.fragment);
				S = r();
				b = i("p");
				b.innerHTML = le;
				W = r();
				v = i("p");
				v.innerHTML = me;
				B = r();
				$ = i("ol");
				$.innerHTML = ce;
				R = r();
				D = i("p");
				D.innerHTML = fe;
				Z = r();
				O(h.$$.fragment);
				G = r();
				M = i("ol");
				M.innerHTML = pe;
				J = r();
				O(y.$$.fragment);
				K = r();
				m = i("div");
				O(x.$$.fragment);
				oe = r();
				I = i("p");
				I.textContent = ue;
				se = r();
				_ = i("div");
				O(w.$$.fragment);
				re = r();
				q = i("p");
				q.innerHTML = he;
				Q = r();
				O(k.$$.fragment);
				X = r();
				c = i("div");
				O(L.$$.fragment);
				ae = r();
				F = i("p");
				F.innerHTML = _e;
				Y = r();
				E = i("p");
				this.h();
			},

			// Claim: adopt the nodes found in `e`; content is only rewritten when a
			// node's stored hash differs from the expected one.
			l(e) {
				const t = De("svelte-u9bgzb", document.head);
				s = d(t, "META", { name: true, content: true });
				t.forEach(n);
				g = a(e);
				l = d(e, "P", {});
				V(l).forEach(n);
				z = a(e);
				C(T.$$.fragment, e);
				S = a(e);
				b = d(e, "P", { "data-svelte-h": true });
				if (u(b) !== "svelte-1p1gwo8") {
					b.innerHTML = le;
				}
				W = a(e);
				v = d(e, "P", { "data-svelte-h": true });
				if (u(v) !== "svelte-ytlpm7") {
					v.innerHTML = me;
				}
				B = a(e);
				$ = d(e, "OL", { "data-svelte-h": true });
				if (u($) !== "svelte-10ra9yx") {
					$.innerHTML = ce;
				}
				R = a(e);
				D = d(e, "P", { "data-svelte-h": true });
				if (u(D) !== "svelte-1wqmwav") {
					D.innerHTML = fe;
				}
				Z = a(e);
				C(h.$$.fragment, e);
				G = a(e);
				M = d(e, "OL", { "data-svelte-h": true });
				if (u(M) !== "svelte-m2jel9") {
					M.innerHTML = pe;
				}
				J = a(e);
				C(y.$$.fragment, e);
				K = a(e);
				m = d(e, "DIV", { class: true });
				const f = V(m);
				C(x.$$.fragment, f);
				oe = a(f);
				I = d(f, "P", { "data-svelte-h": true });
				if (u(I) !== "svelte-1dpkeub") {
					I.textContent = ue;
				}
				se = a(f);
				_ = d(f, "DIV", { class: true });
				const te = V(_);
				C(w.$$.fragment, te);
				re = a(te);
				q = d(te, "P", { "data-svelte-h": true });
				if (u(q) !== "svelte-m1qjr6") {
					q.innerHTML = he;
				}
				te.forEach(n);
				f.forEach(n);
				Q = a(e);
				C(k.$$.fragment, e);
				X = a(e);
				c = d(e, "DIV", { class: true });
				const ne = V(c);
				C(L.$$.fragment, ne);
				ae = a(ne);
				F = d(ne, "P", { "data-svelte-h": true });
				if (u(F) !== "svelte-ip8esq") {
					F.innerHTML = _e;
				}
				ne.forEach(n);
				Y = a(e);
				E = d(e, "P", {});
				V(E).forEach(n);
				this.h();
			},

			// Hydrate attributes: head metadata plus the shared docstring container class.
			h() {
				const docstringClass = "docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8";
				j(s, "name", "hf:doc:metadata");
				// `we` is a module-level const declared further down; it is initialised
				// by the time any fragment hook runs.
				j(s, "content", we);
				j(_, "class", docstringClass);
				j(m, "class", docstringClass);
				j(c, "class", docstringClass);
			},

			// Mount: insert nodes into target `e` before anchor `t`; the <meta> goes to <head>.
			m(e, t) {
				p(document.head, s);
				o(e, g, t);
				o(e, l, t);
				o(e, z, t);
				N(T, e, t);
				o(e, S, t);
				o(e, b, t);
				o(e, W, t);
				o(e, v, t);
				o(e, B, t);
				o(e, $, t);
				o(e, R, t);
				o(e, D, t);
				o(e, Z, t);
				N(h, e, t);
				o(e, G, t);
				o(e, M, t);
				o(e, J, t);
				N(y, e, t);
				o(e, K, t);
				o(e, m, t);
				N(x, m, null);
				p(m, oe);
				p(m, I);
				p(m, se);
				p(m, _);
				N(w, _, null);
				p(_, re);
				p(_, q);
				o(e, Q, t);
				N(k, e, t);
				o(e, X, t);
				o(e, c, t);
				N(L, c, null);
				p(c, ae);
				p(c, F);
				o(e, Y, t);
				o(e, E, t);
				ee = true;
			},

			// Update: only the Tip receives changes, and only when dirty bit 2 is set.
			p(e, [t]) {
				const f = {};
				if (t & 2) {
					f.$$scope = { dirty: t, ctx: e };
				}
				h.$set(f);
			},

			// Intro: run once per mount for every child component, in document order.
			i(e) {
				if (ee) {
					return;
				}
				for (const child of [T, h, y, x, w, k, L]) {
					P(child.$$.fragment, e);
				}
				ee = true;
			},

			// Outro: mirror of i(); clears the flag so i() can run again.
			o(e) {
				for (const child of [T, h, y, x, w, k, L]) {
					H(child.$$.fragment, e);
				}
				ee = false;
			},

			// Destroy: detach top-level nodes when `e` is truthy, always detach the
			// <meta> from <head>, then destroy the child components.
			d(e) {
				if (e) {
					for (const node of [g, l, z, S, b, W, v, B, $, R, D, Z, G, M, J, K, m, Q, X, c, Y, E]) {
						n(node);
					}
				}
				n(s);
				A(T, e);
				A(h, e);
				A(y, e);
				// x, w and L live inside m / _ / c and are destroyed without the detach flag.
				A(x);
				A(w);
				A(k, e);
				A(L);
			},
		};
	}

	// JSON table of contents for this page, written to the "hf:doc:metadata" <meta> tag.
	const we = '{"title":"Transformer2D","local":"transformer2d","sections":[{"title":"Transformer2DModel","local":"diffusers.Transformer2DModel","sections":[],"depth":2},{"title":"Transformer2DModelOutput","local":"diffusers.models.transformer_2d.Transformer2DModelOutput","sections":[],"depth":2}],"depth":1}';

	/**
	 * Instance initialiser passed to the component init helper.
	 *
	 * Registers a callback that reads the `fw` query parameter from the current
	 * URL; the value is discarded, so the callback has no visible effect here.
	 *
	 * @param {*} U - Unused.
	 * @returns {Array} Empty context array.
	 */
	function ke(U) {
		Te(() => {
			new URLSearchParams(window.location.search).get("fw");
		});
		return [];
	}

	/** Page component: wires `ke` (instance) and `xe` (fragment) into the base class. */
	class He extends ve {
		constructor(s) {
			super();
			$e(this, s, ke, xe, ge, {});
		}
	}

	export { He as component };

Xet Storage Details

Size: 16.1 kB
Xet hash: 4f1f30133acac3d5b48fa53cc597c0f1a42ce8bdaba352a22a009edb3be48192

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.