Buckets:

rtrm's picture
download
raw
18.2 kB
/*
 * Compiled (minified) page module for the "Prior Transformer" API
 * documentation page. It exports one component class (`sr`, exported as
 * `component`) whose markup is produced by the fragment factory `Ye`.
 *
 * NOTE(review): this looks like Svelte compiler output (`$$.fragment`,
 * `data-svelte-h`, `svelte-…` hashes). The single-letter helpers imported
 * below are minified aliases whose implementations live in the chunk files and
 * are not visible here; comments about what they do are assumptions to be
 * confirmed against those chunks.
 *
 * Line terminators are only allowed inside the backtick template literals
 * below; every quoted string literal must stay on a single physical line.
 */
import{s as Be,n as Ge,o as Je}from"../chunks/scheduler.182ea377.js";
import{S as Ke,i as Qe,g as o,s as i,p as w,A as Xe,h as n,f as r,c as a,j as l,q as C,m as f,k as d,v as t,a as m,r as j,d as N,t as L,u as O}from"../chunks/index.008d68e4.js";
import{D as Z}from"../chunks/Docstring.7aec8b85.js";
import{I as je}from"../chunks/IconCopyLink.96bbb92b.js";

/**
 * Fragment factory for the page.
 *
 * Instantiates three `je` (IconCopyLink) and five `Z` (Docstring) child
 * components and returns an object with the methods `c`, `l`, `h`, `m`, `p`,
 * `i`, `o` and `d`, which build, adopt, decorate, insert, update, show, hide
 * and remove the page's nodes respectively (presumably the Svelte fragment
 * lifecycle — confirm against the runtime chunk).
 *
 * @param {*} Ne - Unused by this function body.
 * @returns {{c: Function, l: Function, h: Function, m: Function, p: Function, i: Function, o: Function, d: Function}}
 */
function Ye(Ne){
  // Node/component handles interleaved with the static text and HTML they display.
  let h,ee,u,v,Q,k,_e,V,
    Le="Prior Transformer",
    re,I,
    Oe='The Prior Transformer was originally introduced in <a href="https://huggingface.co/papers/2204.06125" rel="nofollow">Hierarchical Text-Conditional Image Generation with CLIP Latents</a> by Ramesh et al. It is used to predict CLIP image embeddings from CLIP text embeddings; image embeddings are predicted through a denoising diffusion process.',
    te,A,
    ke="The abstract from the paper is:",
    oe,E,
    Ie="<em>Contrastive models like CLIP have been shown to learn robust representations of images that capture both semantics and style. To leverage these representations for image generation, we propose a two-stage model: a prior that generates a CLIP image embedding given a text caption, and a decoder that generates an image conditioned on the image embedding. We show that explicitly generating image representations improves image diversity with minimal loss in photorealism and caption similarity. Our decoders conditioned on image representations can also produce variations of an image that preserve both its semantics and style, while varying the non-essential details absent from the image representation. Moreover, the joint embedding space of CLIP enables language-guided image manipulations in a zero-shot fashion. We use diffusion models for the decoder and experiment with both autoregressive and diffusion models for the prior, finding that the latter are computationally more efficient and produce higher-quality samples.</em>",
    ne,_,T,X,S,ge,U,
    Ae="PriorTransformer",
    se,c,z,be,W,
    Ee="A Prior Transformer model.",
    ve,P,D,Te,R,
    Se='The <a href="/docs/diffusers/v0.25.0/ja/api/models/prior_transformer#diffusers.PriorTransformer">PriorTransformer</a> forward method.',
    Pe,x,H,xe,B,
    ze="Sets the attention processor to use to compute attention.",
    ye,y,M,$e,G,
    De="Disables custom attention processors and sets the default attention implementation.",
    ie,g,$,Y,F,we,J,
    He="PriorTransformerOutput",
    ae,b,q,Ce,K,
    Me='The output of <a href="/docs/diffusers/v0.25.0/ja/api/models/prior_transformer#diffusers.PriorTransformer">PriorTransformer</a>.',
    de;

  // Child components are created first (comma expression), then the fragment object is returned.
  return k=new je({}),
  S=new je({}),
  // Docstring for the PriorTransformer class.
  z=new Z({props:{name:"class diffusers.PriorTransformer",anchor:"diffusers.PriorTransformer",parameters:[{name:"num_attention_heads",val:": int = 32"},{name:"attention_head_dim",val:": int = 64"},{name:"num_layers",val:": int = 20"},{name:"embedding_dim",val:": int = 768"},{name:"num_embeddings",val:" = 77"},{name:"additional_embeddings",val:" = 4"},{name:"dropout",val:": float = 0.0"},{name:"time_embed_act_fn",val:": str = 'silu'"},{name:"norm_in_type",val:": Optional = None"},{name:"embedding_proj_norm_type",val:": Optional = None"},{name:"encoder_hid_proj_type",val:": Optional = 'linear'"},{name:"added_emb_type",val:": Optional = 'prd'"},{name:"time_embed_dim",val:": Optional = None"},{name:"embedding_proj_dim",val:": Optional = None"},{name:"clip_embed_dim",val:": Optional = None"}],parametersDescription:[
{anchor:"diffusers.PriorTransformer.num_attention_heads",description:"<strong>num_attention_heads</strong> (<code>int</code>, <em>optional</em>, defaults to 32) &#x2014; The number of heads to use for multi-head attention.",name:"num_attention_heads"},
{anchor:"diffusers.PriorTransformer.attention_head_dim",description:"<strong>attention_head_dim</strong> (<code>int</code>, <em>optional</em>, defaults to 64) &#x2014; The number of channels in each head.",name:"attention_head_dim"},
{anchor:"diffusers.PriorTransformer.num_layers",description:"<strong>num_layers</strong> (<code>int</code>, <em>optional</em>, defaults to 20) &#x2014; The number of layers of Transformer blocks to use.",name:"num_layers"},
{anchor:"diffusers.PriorTransformer.embedding_dim",description:"<strong>embedding_dim</strong> (<code>int</code>, <em>optional</em>, defaults to 768) &#x2014; The dimension of the model input <code>hidden_states</code>",name:"embedding_dim"},
{anchor:"diffusers.PriorTransformer.num_embeddings",description:`<strong>num_embeddings</strong> (<code>int</code>, <em>optional</em>, defaults to 77) &#x2014;
The number of embeddings of the model input <code>hidden_states</code>`,name:"num_embeddings"},
{anchor:"diffusers.PriorTransformer.additional_embeddings",description:`<strong>additional_embeddings</strong> (<code>int</code>, <em>optional</em>, defaults to 4) &#x2014; The number of additional tokens appended to the
projected <code>hidden_states</code>. The actual length of the used <code>hidden_states</code> is <code>num_embeddings + additional_embeddings</code>.`,name:"additional_embeddings"},
{anchor:"diffusers.PriorTransformer.dropout",description:"<strong>dropout</strong> (<code>float</code>, <em>optional</em>, defaults to 0.0) &#x2014; The dropout probability to use.",name:"dropout"},
{anchor:"diffusers.PriorTransformer.time_embed_act_fn",description:`<strong>time_embed_act_fn</strong> (<code>str</code>, <em>optional</em>, defaults to &#x2018;silu&#x2019;) &#x2014;
The activation function to use to create timestep embeddings.`,name:"time_embed_act_fn"},
{anchor:"diffusers.PriorTransformer.norm_in_type",description:`<strong>norm_in_type</strong> (<code>str</code>, <em>optional</em>, defaults to None) &#x2014; The normalization layer to apply on hidden states before
passing to Transformer blocks. Set it to <code>None</code> if normalization is not needed.`,name:"norm_in_type"},
{anchor:"diffusers.PriorTransformer.embedding_proj_norm_type",description:`<strong>embedding_proj_norm_type</strong> (<code>str</code>, <em>optional</em>, defaults to None) &#x2014;
The normalization layer to apply on the input <code>proj_embedding</code>. Set it to <code>None</code> if normalization is not
needed.`,name:"embedding_proj_norm_type"},
{anchor:"diffusers.PriorTransformer.encoder_hid_proj_type",description:`<strong>encoder_hid_proj_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>linear</code>) &#x2014;
The projection layer to apply on the input <code>encoder_hidden_states</code>. Set it to <code>None</code> if
<code>encoder_hidden_states</code> is <code>None</code>.`,name:"encoder_hid_proj_type"},
{anchor:"diffusers.PriorTransformer.added_emb_type",description:`<strong>added_emb_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>prd</code>) &#x2014; Additional embeddings to condition the model.
Choose from <code>prd</code> or <code>None</code>. if choose <code>prd</code>, it will prepend a token indicating the (quantized) dot
product between the text embedding and image embedding as proposed in the unclip paper
<a href="https://arxiv.org/abs/2204.06125" rel="nofollow">https://arxiv.org/abs/2204.06125</a> If it is <code>None</code>, no additional embeddings will be prepended.`,name:"added_emb_type"},
{anchor:"diffusers.PriorTransformer.time_embed_dim",description:"<strong>time_embed_dim</strong> (<code>int</code>, <em>optional</em>, defaults to None) &#x2014; The dimension of timestep embeddings. If None, will be set to <code>num_attention_heads * attention_head_dim</code>.",name:"time_embed_dim"},
{anchor:"diffusers.PriorTransformer.embedding_proj_dim",description:`<strong>embedding_proj_dim</strong> (<code>int</code>, <em>optional</em>, default to None) &#x2014;
The dimension of <code>proj_embedding</code>. If None, will be set to <code>embedding_dim</code>.`,name:"embedding_proj_dim"},
{anchor:"diffusers.PriorTransformer.clip_embed_dim",description:`<strong>clip_embed_dim</strong> (<code>int</code>, <em>optional</em>, default to None) &#x2014;
The dimension of the output. If None, will be set to <code>embedding_dim</code>.`,name:"clip_embed_dim"}],source:"https://github.com/huggingface/diffusers/blob/v0.25.0/src/diffusers/models/prior_transformer.py#L36"}}),
  // Docstring for PriorTransformer.forward.
  D=new Z({props:{name:"forward",anchor:"diffusers.PriorTransformer.forward",parameters:[{name:"hidden_states",val:""},{name:"timestep",val:": Union"},{name:"proj_embedding",val:": FloatTensor"},{name:"encoder_hidden_states",val:": Optional = None"},{name:"attention_mask",val:": Optional = None"},{name:"return_dict",val:": bool = True"}],parametersDescription:[
{anchor:"diffusers.PriorTransformer.forward.hidden_states",description:`<strong>hidden_states</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, embedding_dim)</code>) &#x2014;
The currently predicted image embeddings.`,name:"hidden_states"},
{anchor:"diffusers.PriorTransformer.forward.timestep",description:`<strong>timestep</strong> (<code>torch.LongTensor</code>) &#x2014;
Current denoising step.`,name:"timestep"},
{anchor:"diffusers.PriorTransformer.forward.proj_embedding",description:`<strong>proj_embedding</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, embedding_dim)</code>) &#x2014;
Projected embedding vector the denoising process is conditioned on.`,name:"proj_embedding"},
{anchor:"diffusers.PriorTransformer.forward.encoder_hidden_states",description:`<strong>encoder_hidden_states</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, num_embeddings, embedding_dim)</code>) &#x2014;
Hidden states of the text embeddings the denoising process is conditioned on.`,name:"encoder_hidden_states"},
{anchor:"diffusers.PriorTransformer.forward.attention_mask",description:`<strong>attention_mask</strong> (<code>torch.BoolTensor</code> of shape <code>(batch_size, num_embeddings)</code>) &#x2014;
Text mask for the text embeddings.`,name:"attention_mask"},
{anchor:"diffusers.PriorTransformer.forward.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
Whether or not to return a <a href="/docs/diffusers/v0.25.0/ja/api/models/prior_transformer#diffusers.models.prior_transformer.PriorTransformerOutput">PriorTransformerOutput</a> instead of a plain
tuple.`,name:"return_dict"}],source:"https://github.com/huggingface/diffusers/blob/v0.25.0/src/diffusers/models/prior_transformer.py#L247",returnDescription:`
<p>If return_dict is True, a <a
href="/docs/diffusers/v0.25.0/ja/api/models/prior_transformer#diffusers.models.prior_transformer.PriorTransformerOutput"
>PriorTransformerOutput</a> is returned, otherwise a
tuple is returned where the first element is the sample tensor.</p>
`,returnType:`
<p><a
href="/docs/diffusers/v0.25.0/ja/api/models/prior_transformer#diffusers.models.prior_transformer.PriorTransformerOutput"
>PriorTransformerOutput</a> or <code>tuple</code></p>
`}}),
  // Docstring for PriorTransformer.set_attn_processor.
  H=new Z({props:{name:"set_attn_processor",anchor:"diffusers.PriorTransformer.set_attn_processor",parameters:[{name:"processor",val:": Union"},{name:"_remove_lora",val:" = False"}],parametersDescription:[
{anchor:"diffusers.PriorTransformer.set_attn_processor.processor",description:`<strong>processor</strong> (<code>dict</code> of <code>AttentionProcessor</code> or only <code>AttentionProcessor</code>) &#x2014;
The instantiated processor class or a dictionary of processor classes that will be set as the processor
for <strong>all</strong> <code>Attention</code> layers.</p>
<p>If <code>processor</code> is a dict, the key needs to define the path to the corresponding cross attention
processor. This is strongly recommended when setting trainable attention processors.`,name:"processor"}],source:"https://github.com/huggingface/diffusers/blob/v0.25.0/src/diffusers/models/prior_transformer.py#L195"}}),
  // Docstring for PriorTransformer.set_default_attn_processor.
  M=new Z({props:{name:"set_default_attn_processor",anchor:"diffusers.PriorTransformer.set_default_attn_processor",parameters:[],source:"https://github.com/huggingface/diffusers/blob/v0.25.0/src/diffusers/models/prior_transformer.py#L232"}}),
  F=new je({}),
  // Docstring for the PriorTransformerOutput class.
  q=new Z({props:{name:"class diffusers.models.prior_transformer.PriorTransformerOutput",anchor:"diffusers.models.prior_transformer.PriorTransformerOutput",parameters:[{name:"predicted_image_embedding",val:": FloatTensor"}],parametersDescription:[
{anchor:"diffusers.models.prior_transformer.PriorTransformerOutput.predicted_image_embedding",description:`<strong>predicted_image_embedding</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, embedding_dim)</code>) &#x2014;
The predicted CLIP image embedding conditioned on the CLIP text embedding input.`,name:"predicted_image_embedding"}],source:"https://github.com/huggingface/diffusers/blob/v0.25.0/src/diffusers/models/prior_transformer.py#L23"}}),
  {
    // Create: builds every node from scratch via `o(tag)` / `i()`, fills the static
    // text/HTML, runs `w` on each child fragment, then applies attributes via h().
    c(){
      h=o("meta"),ee=i(),
      u=o("h1"),v=o("a"),Q=o("span"),w(k.$$.fragment),_e=i(),V=o("span"),V.textContent=Le,re=i(),
      I=o("p"),I.innerHTML=Oe,te=i(),
      A=o("p"),A.textContent=ke,oe=i(),
      E=o("p"),E.innerHTML=Ie,ne=i(),
      _=o("h2"),T=o("a"),X=o("span"),w(S.$$.fragment),ge=i(),U=o("span"),U.textContent=Ae,se=i(),
      c=o("div"),w(z.$$.fragment),be=i(),W=o("p"),W.textContent=Ee,ve=i(),
      P=o("div"),w(D.$$.fragment),Te=i(),R=o("p"),R.innerHTML=Se,Pe=i(),
      x=o("div"),w(H.$$.fragment),xe=i(),B=o("p"),B.textContent=ze,ye=i(),
      y=o("div"),w(M.$$.fragment),$e=i(),G=o("p"),G.textContent=De,ie=i(),
      g=o("h2"),$=o("a"),Y=o("span"),w(F.$$.fragment),we=i(),J=o("span"),J.textContent=He,ae=i(),
      b=o("div"),w(q.$$.fragment),Ce=i(),K=o("p"),K.innerHTML=Me,
      this.h()
    },
    // Claim: takes the nodes from the existing list `e` (and from document.head)
    // instead of creating them. Static content is only rewritten when the value
    // returned by `f(node)` differs from the expected "svelte-…" hash.
    l(e){
      const s=Xe("svelte-1phssyn",document.head);
      h=n(s,"META",{name:!0,content:!0}),s.forEach(r),ee=a(e),
      u=n(e,"H1",{class:!0});
      var me=l(u);
      v=n(me,"A",{id:!0,class:!0,href:!0});
      var Fe=l(v);
      Q=n(Fe,"SPAN",{});
      var qe=l(Q);
      C(k.$$.fragment,qe),qe.forEach(r),Fe.forEach(r),_e=a(me),
      V=n(me,"SPAN",{"data-svelte-h":!0}),f(V)!=="svelte-vb9rak"&&(V.textContent=Le),me.forEach(r),re=a(e),
      I=n(e,"P",{"data-svelte-h":!0}),f(I)!=="svelte-j0j2ni"&&(I.innerHTML=Oe),te=a(e),
      A=n(e,"P",{"data-svelte-h":!0}),f(A)!=="svelte-1cwsb16"&&(A.textContent=ke),oe=a(e),
      E=n(e,"P",{"data-svelte-h":!0}),f(E)!=="svelte-ha34c8"&&(E.innerHTML=Ie),ne=a(e),
      _=n(e,"H2",{class:!0});
      var ce=l(_);
      T=n(ce,"A",{id:!0,class:!0,href:!0});
      var Ve=l(T);
      X=n(Ve,"SPAN",{});
      var Ue=l(X);
      C(S.$$.fragment,Ue),Ue.forEach(r),Ve.forEach(r),ge=a(ce),
      U=n(ce,"SPAN",{"data-svelte-h":!0}),f(U)!=="svelte-148p2aw"&&(U.textContent=Ae),ce.forEach(r),se=a(e),
      c=n(e,"DIV",{class:!0});
      var p=l(c);
      C(z.$$.fragment,p),be=a(p),
      W=n(p,"P",{"data-svelte-h":!0}),f(W)!=="svelte-jdqpwr"&&(W.textContent=Ee),ve=a(p),
      P=n(p,"DIV",{class:!0});
      var le=l(P);
      C(D.$$.fragment,le),Te=a(le),
      R=n(le,"P",{"data-svelte-h":!0}),f(R)!=="svelte-tnzbyb"&&(R.innerHTML=Se),le.forEach(r),Pe=a(p),
      x=n(p,"DIV",{class:!0});
      var fe=l(x);
      C(H.$$.fragment,fe),xe=a(fe),
      B=n(fe,"P",{"data-svelte-h":!0}),f(B)!=="svelte-1o77hl2"&&(B.textContent=ze),fe.forEach(r),ye=a(p),
      y=n(p,"DIV",{class:!0});
      var pe=l(y);
      C(M.$$.fragment,pe),$e=a(pe),
      G=n(pe,"P",{"data-svelte-h":!0}),f(G)!=="svelte-1lxcwhv"&&(G.textContent=De),pe.forEach(r),p.forEach(r),ie=a(e),
      g=n(e,"H2",{class:!0});
      var he=l(g);
      $=n(he,"A",{id:!0,class:!0,href:!0});
      var We=l($);
      Y=n(We,"SPAN",{});
      var Re=l(Y);
      C(F.$$.fragment,Re),Re.forEach(r),We.forEach(r),we=a(he),
      J=n(he,"SPAN",{"data-svelte-h":!0}),f(J)!=="svelte-ent30z"&&(J.textContent=He),he.forEach(r),ae=a(e),
      b=n(e,"DIV",{class:!0});
      var ue=l(b);
      C(q.$$.fragment,ue),Ce=a(ue),
      K=n(ue,"P",{"data-svelte-h":!0}),f(K)!=="svelte-11cnw6j"&&(K.innerHTML=Me),ue.forEach(r),
      this.h()
    },
    // Attributes: page metadata on the <meta> node, ids/hrefs on the heading
    // anchors, and CSS classes on headings and docstring containers.
    h(){
      d(h,"name","hf:doc:metadata"),d(h,"content",JSON.stringify(Ze)),
      d(v,"id","prior-transformer"),
      d(v,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),
      d(v,"href","#prior-transformer"),
      d(u,"class","relative group"),
      d(T,"id","diffusers.PriorTransformer"),
      d(T,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),
      d(T,"href","#diffusers.PriorTransformer"),
      d(_,"class","relative group"),
      d(P,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),
      d(x,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),
      d(y,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),
      d(c,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),
      d($,"id","diffusers.models.prior_transformer.PriorTransformerOutput"),
      d($,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),
      d($,"href","#diffusers.models.prior_transformer.PriorTransformerOutput"),
      d(g,"class","relative group"),
      d(b,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8")
    },
    // Mount: the <meta> node goes into document.head; everything else is
    // inserted into target `e` relative to anchor `s`, in document order.
    m(e,s){
      t(document.head,h),m(e,ee,s),
      m(e,u,s),t(u,v),t(v,Q),j(k,Q,null),t(u,_e),t(u,V),m(e,re,s),
      m(e,I,s),m(e,te,s),
      m(e,A,s),m(e,oe,s),
      m(e,E,s),m(e,ne,s),
      m(e,_,s),t(_,T),t(T,X),j(S,X,null),t(_,ge),t(_,U),m(e,se,s),
      m(e,c,s),j(z,c,null),t(c,be),t(c,W),t(c,ve),
      t(c,P),j(D,P,null),t(P,Te),t(P,R),t(c,Pe),
      t(c,x),j(H,x,null),t(x,xe),t(x,B),t(c,ye),
      t(c,y),j(M,y,null),t(y,$e),t(y,G),m(e,ie,s),
      m(e,g,s),t(g,$),t($,Y),j(F,Y,null),t(g,we),t(g,J),m(e,ae,s),
      m(e,b,s),j(q,b,null),t(b,Ce),t(b,K),
      de=!0
    },
    // Update: the imported `Ge` is used as-is; no per-page update logic is defined here.
    p:Ge,
    // Intro: runs `N` on each child fragment once; `de` guards against repeats.
    i(e){
      de||(N(k.$$.fragment,e),N(S.$$.fragment,e),N(z.$$.fragment,e),N(D.$$.fragment,e),N(H.$$.fragment,e),N(M.$$.fragment,e),N(F.$$.fragment,e),N(q.$$.fragment,e),de=!0)
    },
    // Outro: runs `L` on each child fragment and clears the `de` flag.
    o(e){
      L(k.$$.fragment,e),L(S.$$.fragment,e),L(z.$$.fragment,e),L(D.$$.fragment,e),L(H.$$.fragment,e),L(M.$$.fragment,e),L(F.$$.fragment,e),L(q.$$.fragment,e),de=!1
    },
    // Destroy: top-level nodes are passed to `r` only when `e` is truthy; the
    // <meta> node is always passed to `r`, and `O` runs on every child component.
    d(e){
      e&&(r(ee),r(u),r(re),r(I),r(te),r(A),r(oe),r(E),r(ne),r(_),r(se),r(c),r(ie),r(g),r(ae),r(b)),
      r(h),
      O(k),O(S),O(z),O(D),O(H),O(M),O(F),O(q)
    }
  }
}

/**
 * Page metadata: the page slug and title plus one entry per documented
 * section. It is JSON-serialised into the "hf:doc:metadata" <meta> node.
 */
const Ze={local:"prior-transformer",sections:[{local:"diffusers.PriorTransformer",title:"PriorTransformer"},{local:"diffusers.models.prior_transformer.PriorTransformerOutput",title:"PriorTransformerOutput"}],title:"Prior Transformer"};

/**
 * Instance function for the component.
 *
 * Registers a callback through the imported `Je` helper (presumably Svelte's
 * `onMount` — confirm) that reads the "fw" query parameter from the current
 * URL. The value read is discarded, so the callback has no observable effect
 * beyond the lookup itself.
 *
 * @param {*} Ne - Unused.
 * @returns {Array} Always an empty array.
 */
function er(Ne){
  return Je(()=>{new URLSearchParams(window.location.search).get("fw")}),[]
}

/**
 * Page component. Extends the imported base class `Ke` and initialises itself
 * through `Qe` with the instance function `er` and the fragment factory `Ye`.
 */
class sr extends Ke{
  /** @param {*} h - Options forwarded unchanged to `Qe`. */
  constructor(h){super(),Qe(this,h,er,Ye,Be,{})}
}

export{sr as component};

Xet Storage Details

Size:
18.2 kB
·
Xet hash:
520070d87f75a458a4c21c45b455fb33375ed489463e72c32384777e45a2384e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.