Buckets:
hf-doc-build/doc / diffusers /v0.16.0 /en /_app /pages /using-diffusers /controlling_generation.mdx-hf-doc-builder.js
| import{S as qf,i as Gf,s as jf,e as a,k as h,w,t as s,M as zf,c as o,d as t,m as u,a as r,x as _,h as f,b as p,G as i,g as l,y as x,q as b,o as E,B as P,v as Lf}from"../../chunks/vendor-hf-doc-builder.js";import{T as Df}from"../../chunks/Tip-hf-doc-builder.js";import{I as $}from"../../chunks/IconCopyLink-hf-doc-builder.js";function Zf(tt){let c,A,m,g,I;return{c(){c=a("p"),A=s(`Pix2Pix Zero is the first model that allows \u201Czero-shot\u201D image editing. This means that the model | |
| can edit an image in less than a minute on a consumer GPU as shown `),m=a("a"),g=s("here"),I=s("."),this.h()},l(y){c=o(y,"P",{});var k=r(c);A=f(k,`Pix2Pix Zero is the first model that allows \u201Czero-shot\u201D image editing. This means that the model | |
| can edit an image in less than a minute on a consumer GPU as shown `),m=o(k,"A",{href:!0});var U=r(m);g=f(U,"here"),U.forEach(t),I=f(k,"."),k.forEach(t),this.h()},h(){p(m,"href","../api/pipelines/stable_diffusion/pix2pix_zero#usage-example")},m(y,k){l(y,c,k),i(c,A),i(c,m),i(m,g),i(c,I)},d(y){y&&t(c)}}}function Cf(tt){let c,A;return{c(){c=a("p"),A=s(`An important distinction between methods like InstructPix2Pix and Pix2Pix Zero is that the former | |
| involves fine-tuning the pre-trained weights while the latter does not. This means that you can | |
| apply Pix2Pix Zero to any of the available Stable Diffusion models.`)},l(m){c=o(m,"P",{});var g=r(c);A=f(g,`An important distinction between methods like InstructPix2Pix and Pix2Pix Zero is that the former | |
| involves fine-tuning the pre-trained weights while the latter does not. This means that you can | |
| apply Pix2Pix Zero to any of the available Stable Diffusion models.`),g.forEach(t)},m(m,g){l(m,c,g),i(c,A)},d(m){m&&t(c)}}}function Mf(tt){let c,A,m,g,I,y,k,U,Io,Ui,it,To,Wi,at,No,Bi,ot,Do,Oi,S,qo,oi,Go,jo,xe,zo,Lo,be,Zo,Co,Ri,rt,Mo,Vi,nt,Ho,Ji,lt,Fo,Ki,d,ri,st,Uo,Wo,ni,ft,Bo,Oo,li,pt,Ro,Vo,si,ht,Jo,Ko,fi,ut,Qo,Xo,pi,dt,Yo,er,hi,ct,tr,ir,ui,mt,ar,or,di,vt,rr,nr,ci,gt,lr,sr,mi,wt,fr,Qi,T,W,vi,Ee,pr,gi,hr,Xi,_t,Pe,ur,Yi,N,xt,dr,cr,ye,mr,vr,ea,B,gr,bt,wr,_r,ta,D,O,wi,$e,xr,_i,br,ia,Et,Ae,Er,aa,Se,Pt,Pr,yr,oa,yt,$r,ra,$t,Ar,na,R,ke,Sr,Ie,kr,Ir,Tr,Te,Nr,Ne,Dr,qr,la,V,sa,J,Gr,At,jr,zr,fa,K,Lr,St,Zr,Cr,pa,q,Q,xi,De,Mr,bi,Hr,ha,kt,qe,Fr,ua,Ge,It,Ur,Wr,da,Tt,Br,ca,X,Or,Ei,Rr,Vr,ma,Y,Jr,Nt,Kr,Qr,va,G,ee,Pi,je,Xr,yi,Yr,ga,Dt,ze,en,wa,qt,tn,_a,Gt,an,xa,jt,on,ba,te,rn,zt,nn,ln,Ea,j,ie,$i,Le,sn,Ai,fn,Pa,Lt,Ze,pn,ya,Ce,Zt,hn,un,$a,Ct,dn,Aa,ae,cn,Mt,mn,vn,Sa,z,oe,Si,Me,gn,ki,wn,ka,Ht,He,_n,Ia,Fe,Ft,xn,bn,Ta,Ut,En,Na,re,Pn,Wt,yn,$n,Da,ne,qa,L,le,Ii,Ue,An,Ti,Sn,Ga,Bt,We,kn,ja,se,In,Ot,Tn,Nn,za,fe,Dn,Rt,qn,Gn,La,Z,pe,Ni,Be,jn,Di,zn,Za,Vt,Ln,Ca,C,he,qi,Oe,Zn,Gi,Cn,Ma,Re,Jt,Mn,Hn,Ha,ue,Fn,Kt,Un,Wn,Fa,M,de,ji,Ve,Bn,zi,On,Ua,Je,Qt,Rn,Vn,Wa,ce,Jn,Xt,Kn,Qn,Ba,H,me,Li,Ke,Xn,Zi,Yn,Oa,Yt,Qe,el,Ra,Xe,ei,tl,il,Va,ve,al,ti,ol,rl,Ja,F,ge,Ci,Ye,nl,Mi,ll,Ka,ii,sl,Qa,we,fl,ai,pl,hl,Xa;return y=new $({}),Ee=new $({}),$e=new $({}),V=new Df({props:{$$slots:{default:[Zf]},$$scope:{ctx:tt}}}),De=new $({}),je=new $({}),Le=new $({}),Me=new $({}),ne=new Df({props:{$$slots:{default:[Cf]},$$scope:{ctx:tt}}}),Ue=new $({}),Be=new $({}),Oe=new $({}),Ve=new $({}),Ke=new $({}),Ye=new $({}),{c(){c=a("meta"),A=h(),m=a("h1"),g=a("a"),I=a("span"),w(y.$$.fragment),k=h(),U=a("span"),Io=s("Controlled generation"),Ui=h(),it=a("p"),To=s("Controlling outputs generated by diffusion models has been long pursued by the community and is now an active research topic. In many popular diffusion models, subtle changes in inputs, both images and text prompts, can drastically change outputs. In an ideal world we want to be able to control how semantics are preserved and changed."),Wi=h(),at=a("p"),No=s("Most examples of preserving semantics reduce to being able to accurately map a change in input to a change in output. I.e. adding an adjective to a subject in a prompt preserves the entire image, only modifying the changed subject. Or, image variation of a particular subject preserves the subject\u2019s pose."),Bi=h(),ot=a("p"),Do=s("Additionally, there are qualities of generated images that we would like to influence beyond semantic preservation. I.e. in general, we would like our outputs to be of good quality, adhere to a particular style, or be realistic."),Oi=h(),S=a("p"),qo=s("We will document some of the techniques "),oi=a("code"),Go=s("diffusers"),jo=s(" supports to control generation of diffusion models. Much is cutting edge research and can be quite nuanced. If something needs clarifying or you have a suggestion, don\u2019t hesitate to open a discussion on the "),xe=a("a"),zo=s("forum"),Lo=s(" or a "),be=a("a"),Zo=s("GitHub issue"),Co=s("."),Ri=h(),rt=a("p"),Mo=s("We provide a high level explanation of how the generation can be controlled as well as a snippet of the technicals. For more in depth explanations on the technicals, the original papers which are linked from the pipelines are always the best resources."),Vi=h(),nt=a("p"),Ho=s("Depending on the use case, one should choose a technique accordingly. In many cases, these techniques can be combined. For example, one can combine Textual Inversion with SEGA to provide more semantic guidance to the outputs generated using Textual Inversion."),Ji=h(),lt=a("p"),Fo=s("Unless otherwise mentioned, these are techniques that work with existing models and don\u2019t require their own weights."),Ki=h(),d=a("ol"),ri=a("li"),st=a("a"),Uo=s("Instruct Pix2Pix"),Wo=h(),ni=a("li"),ft=a("a"),Bo=s("Pix2Pix Zero"),Oo=h(),li=a("li"),pt=a("a"),Ro=s("Attend and Excite"),Vo=h(),si=a("li"),ht=a("a"),Jo=s("Semantic Guidance"),Ko=h(),fi=a("li"),ut=a("a"),Qo=s("Self-attention Guidance"),Xo=h(),pi=a("li"),dt=a("a"),Yo=s("Depth2Image"),er=h(),hi=a("li"),ct=a("a"),tr=s("MultiDiffusion Panorama"),ir=h(),ui=a("li"),mt=a("a"),ar=s("DreamBooth"),or=h(),di=a("li"),vt=a("a"),rr=s("Textual Inversion"),nr=h(),ci=a("li"),gt=a("a"),lr=s("ControlNet"),sr=h(),mi=a("li"),wt=a("a"),fr=s("Prompt Weighting"),Qi=h(),T=a("h2"),W=a("a"),vi=a("span"),w(Ee.$$.fragment),pr=h(),gi=a("span"),hr=s("Instruct Pix2Pix"),Xi=h(),_t=a("p"),Pe=a("a"),ur=s("Paper"),Yi=h(),N=a("p"),xt=a("a"),dr=s("Instruct Pix2Pix"),cr=s(` is fine-tuned from stable diffusion to support editing input images. It takes as inputs an image and a prompt describing an edit, and it outputs the edited image. | |
| Instruct Pix2Pix has been explicitly trained to work well with `),ye=a("a"),mr=s("InstructGPT"),vr=s("-like prompts."),ea=h(),B=a("p"),gr=s("See "),bt=a("a"),wr=s("here"),_r=s(" for more information on how to use it."),ta=h(),D=a("h2"),O=a("a"),wi=a("span"),w($e.$$.fragment),xr=h(),_i=a("span"),br=s("Pix2Pix Zero"),ia=h(),Et=a("p"),Ae=a("a"),Er=s("Paper"),aa=h(),Se=a("p"),Pt=a("a"),Pr=s("Pix2Pix Zero"),yr=s(" allows modifying an image so that one concept or subject is translated to another one while preserving general image semantics."),oa=h(),yt=a("p"),$r=s("The denoising process is guided from one conceptual embedding towards another conceptual embedding. The intermediate latents are optimized during the denoising process to push the attention maps towards reference attention maps. The reference attention maps are from the denoising process of the input image and are used to encourage semantic preservation."),ra=h(),$t=a("p"),Ar=s("Pix2Pix Zero can be used both to edit synthetic images as well as real images."),na=h(),R=a("ul"),ke=a("li"),Sr=s(`To edit synthetic images, one first generates an image given a caption. | |
| Next, we generate image captions for the concept that shall be edited and for the new target concept. We can use a model like `),Ie=a("a"),kr=s("Flan-T5"),Ir=s(" for this purpose. Then, \u201Cmean\u201D prompt embeddings for both the source and target concepts are created via the text encoder. Finally, the pix2pix-zero algorithm is used to edit the synthetic image."),Tr=h(),Te=a("li"),Nr=s("To edit a real image, one first generates an image caption using a model like "),Ne=a("a"),Dr=s("BLIP"),qr=s(". Then one applies ddim inversion on the prompt and image to generate \u201Cinverse\u201D latents. Similar to before, \u201Cmean\u201D prompt embeddings for both source and target concepts are created and finally the pix2pix-zero algorithm in combination with the \u201Cinverse\u201D latents is used to edit the image."),la=h(),w(V.$$.fragment),sa=h(),J=a("p"),Gr=s(`As mentioned above, Pix2Pix Zero includes optimizing the latents (and not any of the UNet, VAE, or the text encoder) to steer the generation toward a specific concept. This means that the overall | |
| pipeline might require more memory than a standard `),At=a("a"),jr=s("StableDiffusionPipeline"),zr=s("."),fa=h(),K=a("p"),Lr=s("See "),St=a("a"),Zr=s("here"),Cr=s(" for more information on how to use it."),pa=h(),q=a("h2"),Q=a("a"),xi=a("span"),w(De.$$.fragment),Mr=h(),bi=a("span"),Hr=s("Attend and Excite"),ha=h(),kt=a("p"),qe=a("a"),Fr=s("Paper"),ua=h(),Ge=a("p"),It=a("a"),Ur=s("Attend and Excite"),Wr=s(" allows subjects in the prompt to be faithfully represented in the final image."),da=h(),Tt=a("p"),Br=s("A set of token indices are given as input, corresponding to the subjects in the prompt that need to be present in the image. During denoising, each token index is guaranteed to have a minimum attention threshold for at least one patch of the image. The intermediate latents are iteratively optimized during the denoising process to strengthen the attention of the most neglected subject token until the attention threshold is passed for all subject tokens."),ca=h(),X=a("p"),Or=s("Like Pix2Pix Zero, Attend and Excite also involves a mini optimization loop (leaving the pre-trained weights untouched) in its pipeline and can require more memory than the usual "),Ei=a("code"),Rr=s("StableDiffusionPipeline"),Vr=s("."),ma=h(),Y=a("p"),Jr=s("See "),Nt=a("a"),Kr=s("here"),Qr=s(" for more information on how to use it."),va=h(),G=a("h2"),ee=a("a"),Pi=a("span"),w(je.$$.fragment),Xr=h(),yi=a("span"),Yr=s("Semantic Guidance (SEGA)"),ga=h(),Dt=a("p"),ze=a("a"),en=s("Paper"),wa=h(),qt=a("p"),tn=s("SEGA allows applying or removing one or more concepts from an image. The strength of the concept can also be controlled. I.e. the smile concept can be used to incrementally increase or decrease the smile of a portrait."),_a=h(),Gt=a("p"),an=s("Similar to how classifier free guidance provides guidance via empty prompt inputs, SEGA provides guidance on conceptual prompts. Multiple of these conceptual prompts can be applied simultaneously. Each conceptual prompt can either add or remove their concept depending on if the guidance is applied positively or negatively."),xa=h(),jt=a("p"),on=s("Unlike Pix2Pix Zero or Attend and Excite, SEGA directly interacts with the diffusion process instead of performing any explicit gradient-based optimization."),ba=h(),te=a("p"),rn=s("See "),zt=a("a"),nn=s("here"),ln=s(" for more information on how to use it."),Ea=h(),j=a("h2"),ie=a("a"),$i=a("span"),w(Le.$$.fragment),sn=h(),Ai=a("span"),fn=s("Self-attention Guidance (SAG)"),Pa=h(),Lt=a("p"),Ze=a("a"),pn=s("Paper"),ya=h(),Ce=a("p"),Zt=a("a"),hn=s("Self-attention Guidance"),un=s(" improves the general quality of images."),$a=h(),Ct=a("p"),dn=s("SAG provides guidance from predictions not conditioned on high-frequency details to fully conditioned images. The high frequency details are extracted out of the UNet self-attention maps."),Aa=h(),ae=a("p"),cn=s("See "),Mt=a("a"),mn=s("here"),vn=s(" for more information on how to use it."),Sa=h(),z=a("h2"),oe=a("a"),Si=a("span"),w(Me.$$.fragment),gn=h(),ki=a("span"),wn=s("Depth2Image"),ka=h(),Ht=a("p"),He=a("a"),_n=s("Project"),Ia=h(),Fe=a("p"),Ft=a("a"),xn=s("Depth2Image"),bn=s(" is fine-tuned from Stable Diffusion to better preserve semantics for text guided image variation."),Ta=h(),Ut=a("p"),En=s("It conditions on a monocular depth estimate of the original image."),Na=h(),re=a("p"),Pn=s("See "),Wt=a("a"),yn=s("here"),$n=s(" for more information on how to use it."),Da=h(),w(ne.$$.fragment),qa=h(),L=a("h2"),le=a("a"),Ii=a("span"),w(Ue.$$.fragment),An=h(),Ti=a("span"),Sn=s("MultiDiffusion Panorama"),Ga=h(),Bt=a("p"),We=a("a"),kn=s("Paper"),ja=h(),se=a("p"),In=s(`MultiDiffusion defines a new generation process over a pre-trained diffusion model. This process binds together multiple diffusion generation methods that can be readily applied to generate high quality and diverse images. Results adhere to user-provided controls, such as desired aspect ratio (e.g., panorama), and spatial guiding signals, ranging from tight segmentation masks to bounding boxes. | |
| `),Ot=a("a"),Tn=s("MultiDiffusion Panorama"),Nn=s(" allows to generate high-quality images at arbitrary aspect ratios (e.g., panoramas)."),za=h(),fe=a("p"),Dn=s("See "),Rt=a("a"),qn=s("here"),Gn=s(" for more information on how to use it to generate panoramic images."),La=h(),Z=a("h2"),pe=a("a"),Ni=a("span"),w(Be.$$.fragment),jn=h(),Di=a("span"),zn=s("Fine-tuning your own models"),Za=h(),Vt=a("p"),Ln=s("In addition to pre-trained models, Diffusers has training scripts for fine-tuning models on user-provided data."),Ca=h(),C=a("h3"),he=a("a"),qi=a("span"),w(Oe.$$.fragment),Zn=h(),Gi=a("span"),Cn=s("DreamBooth"),Ma=h(),Re=a("p"),Jt=a("a"),Mn=s("DreamBooth"),Hn=s(" fine-tunes a model to teach it about a new subject. I.e. a few pictures of a person can be used to generate images of that person in different styles."),Ha=h(),ue=a("p"),Fn=s("See "),Kt=a("a"),Un=s("here"),Wn=s(" for more information on how to use it."),Fa=h(),M=a("h3"),de=a("a"),ji=a("span"),w(Ve.$$.fragment),Bn=h(),zi=a("span"),On=s("Textual Inversion"),Ua=h(),Je=a("p"),Qt=a("a"),Rn=s("Textual Inversion"),Vn=s(" fine-tunes a model to teach it about a new concept. I.e. a few pictures of a style of artwork can be used to generate images in that style."),Wa=h(),ce=a("p"),Jn=s("See "),Xt=a("a"),Kn=s("here"),Qn=s(" for more information on how to use it."),Ba=h(),H=a("h2"),me=a("a"),Li=a("span"),w(Ke.$$.fragment),Xn=h(),Zi=a("span"),Yn=s("ControlNet"),Oa=h(),Yt=a("p"),Qe=a("a"),el=s("Paper"),Ra=h(),Xe=a("p"),ei=a("a"),tl=s("ControlNet"),il=s(` is an auxiliary network which adds an extra condition. | |
| There are 8 canonical pre-trained ControlNets trained on different conditionings such as edge detection, scribbles, | |
| depth maps, and semantic segmentations.`),Va=h(),ve=a("p"),al=s("See "),ti=a("a"),ol=s("here"),rl=s(" for more information on how to use it."),Ja=h(),F=a("h2"),ge=a("a"),Ci=a("span"),w(Ye.$$.fragment),nl=h(),Mi=a("span"),ll=s("Prompt Weighting"),Ka=h(),ii=a("p"),sl=s(`Prompt weighting is a simple technique that puts more attention weight on certain parts of the text | |
| input.`),Qa=h(),we=a("p"),fl=s("For a more in-detail explanation and examples, see "),ai=a("a"),pl=s("here"),hl=s("."),this.h()},l(e){const n=zf('[data-svelte="svelte-1phssyn"]',document.head);c=o(n,"META",{name:!0,content:!0}),n.forEach(t),A=u(e),m=o(e,"H1",{class:!0});var et=r(m);g=o(et,"A",{id:!0,class:!0,href:!0});var Hi=r(g);I=o(Hi,"SPAN",{});var _l=r(I);_(y.$$.fragment,_l),_l.forEach(t),Hi.forEach(t),k=u(et),U=o(et,"SPAN",{});var xl=r(U);Io=f(xl,"Controlled generation"),xl.forEach(t),et.forEach(t),Ui=u(e),it=o(e,"P",{});var bl=r(it);To=f(bl,"Controlling outputs generated by diffusion models has been long pursued by the community and is now an active research topic. In many popular diffusion models, subtle changes in inputs, both images and text prompts, can drastically change outputs. In an ideal world we want to be able to control how semantics are preserved and changed."),bl.forEach(t),Wi=u(e),at=o(e,"P",{});var El=r(at);No=f(El,"Most examples of preserving semantics reduce to being able to accurately map a change in input to a change in output. I.e. adding an adjective to a subject in a prompt preserves the entire image, only modifying the changed subject. Or, image variation of a particular subject preserves the subject\u2019s pose."),El.forEach(t),Bi=u(e),ot=o(e,"P",{});var Pl=r(ot);Do=f(Pl,"Additionally, there are qualities of generated images that we would like to influence beyond semantic preservation. I.e. in general, we would like our outputs to be of good quality, adhere to a particular style, or be realistic."),Pl.forEach(t),Oi=u(e),S=o(e,"P",{});var _e=r(S);qo=f(_e,"We will document some of the techniques "),oi=o(_e,"CODE",{});var yl=r(oi);Go=f(yl,"diffusers"),yl.forEach(t),jo=f(_e," supports to control generation of diffusion models. Much is cutting edge research and can be quite nuanced. If something needs clarifying or you have a suggestion, don\u2019t hesitate to open a discussion on the "),xe=o(_e,"A",{href:!0,rel:!0});var $l=r(xe);zo=f($l,"forum"),$l.forEach(t),Lo=f(_e," or a "),be=o(_e,"A",{href:!0,rel:!0});var Al=r(be);Zo=f(Al,"GitHub issue"),Al.forEach(t),Co=f(_e,"."),_e.forEach(t),Ri=u(e),rt=o(e,"P",{});var Sl=r(rt);Mo=f(Sl,"We provide a high level explanation of how the generation can be controlled as well as a snippet of the technicals. For more in depth explanations on the technicals, the original papers which are linked from the pipelines are always the best resources."),Sl.forEach(t),Vi=u(e),nt=o(e,"P",{});var kl=r(nt);Ho=f(kl,"Depending on the use case, one should choose a technique accordingly. In many cases, these techniques can be combined. For example, one can combine Textual Inversion with SEGA to provide more semantic guidance to the outputs generated using Textual Inversion."),kl.forEach(t),Ji=u(e),lt=o(e,"P",{});var Il=r(lt);Fo=f(Il,"Unless otherwise mentioned, these are techniques that work with existing models and don\u2019t require their own weights."),Il.forEach(t),Ki=u(e),d=o(e,"OL",{});var v=r(d);ri=o(v,"LI",{});var Tl=r(ri);st=o(Tl,"A",{href:!0});var Nl=r(st);Uo=f(Nl,"Instruct Pix2Pix"),Nl.forEach(t),Tl.forEach(t),Wo=u(v),ni=o(v,"LI",{});var Dl=r(ni);ft=o(Dl,"A",{href:!0});var ql=r(ft);Bo=f(ql,"Pix2Pix Zero"),ql.forEach(t),Dl.forEach(t),Oo=u(v),li=o(v,"LI",{});var Gl=r(li);pt=o(Gl,"A",{href:!0});var jl=r(pt);Ro=f(jl,"Attend and Excite"),jl.forEach(t),Gl.forEach(t),Vo=u(v),si=o(v,"LI",{});var zl=r(si);ht=o(zl,"A",{href:!0});var Ll=r(ht);Jo=f(Ll,"Semantic Guidance"),Ll.forEach(t),zl.forEach(t),Ko=u(v),fi=o(v,"LI",{});var Zl=r(fi);ut=o(Zl,"A",{href:!0});var Cl=r(ut);Qo=f(Cl,"Self-attention Guidance"),Cl.forEach(t),Zl.forEach(t),Xo=u(v),pi=o(v,"LI",{});var Ml=r(pi);dt=o(Ml,"A",{href:!0});var Hl=r(dt);Yo=f(Hl,"Depth2Image"),Hl.forEach(t),Ml.forEach(t),er=u(v),hi=o(v,"LI",{});var Fl=r(hi);ct=o(Fl,"A",{href:!0});var Ul=r(ct);tr=f(Ul,"MultiDiffusion Panorama"),Ul.forEach(t),Fl.forEach(t),ir=u(v),ui=o(v,"LI",{});var Wl=r(ui);mt=o(Wl,"A",{href:!0});var Bl=r(mt);ar=f(Bl,"DreamBooth"),Bl.forEach(t),Wl.forEach(t),or=u(v),di=o(v,"LI",{});var Ol=r(di);vt=o(Ol,"A",{href:!0});var Rl=r(vt);rr=f(Rl,"Textual Inversion"),Rl.forEach(t),Ol.forEach(t),nr=u(v),ci=o(v,"LI",{});var Vl=r(ci);gt=o(Vl,"A",{href:!0});var Jl=r(gt);lr=f(Jl,"ControlNet"),Jl.forEach(t),Vl.forEach(t),sr=u(v),mi=o(v,"LI",{});var Kl=r(mi);wt=o(Kl,"A",{href:!0});var Ql=r(wt);fr=f(Ql,"Prompt Weighting"),Ql.forEach(t),Kl.forEach(t),v.forEach(t),Qi=u(e),T=o(e,"H2",{class:!0});var Ya=r(T);W=o(Ya,"A",{id:!0,class:!0,href:!0});var Xl=r(W);vi=o(Xl,"SPAN",{});var Yl=r(vi);_(Ee.$$.fragment,Yl),Yl.forEach(t),Xl.forEach(t),pr=u(Ya),gi=o(Ya,"SPAN",{});var es=r(gi);hr=f(es,"Instruct Pix2Pix"),es.forEach(t),Ya.forEach(t),Xi=u(e),_t=o(e,"P",{});var ts=r(_t);Pe=o(ts,"A",{href:!0,rel:!0});var is=r(Pe);ur=f(is,"Paper"),is.forEach(t),ts.forEach(t),Yi=u(e),N=o(e,"P",{});var Fi=r(N);xt=o(Fi,"A",{href:!0});var as=r(xt);dr=f(as,"Instruct Pix2Pix"),as.forEach(t),cr=f(Fi,` is fine-tuned from stable diffusion to support editing input images. It takes as inputs an image and a prompt describing an edit, and it outputs the edited image. | |
| Instruct Pix2Pix has been explicitly trained to work well with `),ye=o(Fi,"A",{href:!0,rel:!0});var os=r(ye);mr=f(os,"InstructGPT"),os.forEach(t),vr=f(Fi,"-like prompts."),Fi.forEach(t),ea=u(e),B=o(e,"P",{});var eo=r(B);gr=f(eo,"See "),bt=o(eo,"A",{href:!0});var rs=r(bt);wr=f(rs,"here"),rs.forEach(t),_r=f(eo," for more information on how to use it."),eo.forEach(t),ta=u(e),D=o(e,"H2",{class:!0});var to=r(D);O=o(to,"A",{id:!0,class:!0,href:!0});var ns=r(O);wi=o(ns,"SPAN",{});var ls=r(wi);_($e.$$.fragment,ls),ls.forEach(t),ns.forEach(t),xr=u(to),_i=o(to,"SPAN",{});var ss=r(_i);br=f(ss,"Pix2Pix Zero"),ss.forEach(t),to.forEach(t),ia=u(e),Et=o(e,"P",{});var fs=r(Et);Ae=o(fs,"A",{href:!0,rel:!0});var ps=r(Ae);Er=f(ps,"Paper"),ps.forEach(t),fs.forEach(t),aa=u(e),Se=o(e,"P",{});var ul=r(Se);Pt=o(ul,"A",{href:!0});var hs=r(Pt);Pr=f(hs,"Pix2Pix Zero"),hs.forEach(t),yr=f(ul," allows modifying an image so that one concept or subject is translated to another one while preserving general image semantics."),ul.forEach(t),oa=u(e),yt=o(e,"P",{});var us=r(yt);$r=f(us,"The denoising process is guided from one conceptual embedding towards another conceptual embedding. The intermediate latents are optimized during the denoising process to push the attention maps towards reference attention maps. The reference attention maps are from the denoising process of the input image and are used to encourage semantic preservation."),us.forEach(t),ra=u(e),$t=o(e,"P",{});var ds=r($t);Ar=f(ds,"Pix2Pix Zero can be used both to edit synthetic images as well as real images."),ds.forEach(t),na=u(e),R=o(e,"UL",{});var io=r(R);ke=o(io,"LI",{});var ao=r(ke);Sr=f(ao,`To edit synthetic images, one first generates an image given a caption. | |
| Next, we generate image captions for the concept that shall be edited and for the new target concept. We can use a model like `),Ie=o(ao,"A",{href:!0,rel:!0});var cs=r(Ie);kr=f(cs,"Flan-T5"),cs.forEach(t),Ir=f(ao," for this purpose. Then, \u201Cmean\u201D prompt embeddings for both the source and target concepts are created via the text encoder. Finally, the pix2pix-zero algorithm is used to edit the synthetic image."),ao.forEach(t),Tr=u(io),Te=o(io,"LI",{});var oo=r(Te);Nr=f(oo,"To edit a real image, one first generates an image caption using a model like "),Ne=o(oo,"A",{href:!0,rel:!0});var ms=r(Ne);Dr=f(ms,"BLIP"),ms.forEach(t),qr=f(oo,". Then one applies ddim inversion on the prompt and image to generate \u201Cinverse\u201D latents. Similar to before, \u201Cmean\u201D prompt embeddings for both source and target concepts are created and finally the pix2pix-zero algorithm in combination with the \u201Cinverse\u201D latents is used to edit the image."),oo.forEach(t),io.forEach(t),la=u(e),_(V.$$.fragment,e),sa=u(e),J=o(e,"P",{});var ro=r(J);Gr=f(ro,`As mentioned above, Pix2Pix Zero includes optimizing the latents (and not any of the UNet, VAE, or the text encoder) to steer the generation toward a specific concept. This means that the overall | |
| pipeline might require more memory than a standard `),At=o(ro,"A",{href:!0});var vs=r(At);jr=f(vs,"StableDiffusionPipeline"),vs.forEach(t),zr=f(ro,"."),ro.forEach(t),fa=u(e),K=o(e,"P",{});var no=r(K);Lr=f(no,"See "),St=o(no,"A",{href:!0});var gs=r(St);Zr=f(gs,"here"),gs.forEach(t),Cr=f(no," for more information on how to use it."),no.forEach(t),pa=u(e),q=o(e,"H2",{class:!0});var lo=r(q);Q=o(lo,"A",{id:!0,class:!0,href:!0});var ws=r(Q);xi=o(ws,"SPAN",{});var _s=r(xi);_(De.$$.fragment,_s),_s.forEach(t),ws.forEach(t),Mr=u(lo),bi=o(lo,"SPAN",{});var xs=r(bi);Hr=f(xs,"Attend and Excite"),xs.forEach(t),lo.forEach(t),ha=u(e),kt=o(e,"P",{});var bs=r(kt);qe=o(bs,"A",{href:!0,rel:!0});var Es=r(qe);Fr=f(Es,"Paper"),Es.forEach(t),bs.forEach(t),ua=u(e),Ge=o(e,"P",{});var dl=r(Ge);It=o(dl,"A",{href:!0});var Ps=r(It);Ur=f(Ps,"Attend and Excite"),Ps.forEach(t),Wr=f(dl," allows subjects in the prompt to be faithfully represented in the final image."),dl.forEach(t),da=u(e),Tt=o(e,"P",{});var ys=r(Tt);Br=f(ys,"A set of token indices are given as input, corresponding to the subjects in the prompt that need to be present in the image. During denoising, each token index is guaranteed to have a minimum attention threshold for at least one patch of the image. The intermediate latents are iteratively optimized during the denoising process to strengthen the attention of the most neglected subject token until the attention threshold is passed for all subject tokens."),ys.forEach(t),ca=u(e),X=o(e,"P",{});var so=r(X);Or=f(so,"Like Pix2Pix Zero, Attend and Excite also involves a mini optimization loop (leaving the pre-trained weights untouched) in its pipeline and can require more memory than the usual "),Ei=o(so,"CODE",{});var $s=r(Ei);Rr=f($s,"StableDiffusionPipeline"),$s.forEach(t),Vr=f(so,"."),so.forEach(t),ma=u(e),Y=o(e,"P",{});var fo=r(Y);Jr=f(fo,"See "),Nt=o(fo,"A",{href:!0});var As=r(Nt);Kr=f(As,"here"),As.forEach(t),Qr=f(fo," for more information on how to use it."),fo.forEach(t),va=u(e),G=o(e,"H2",{class:!0});var po=r(G);ee=o(po,"A",{id:!0,class:!0,href:!0});var Ss=r(ee);Pi=o(Ss,"SPAN",{});var ks=r(Pi);_(je.$$.fragment,ks),ks.forEach(t),Ss.forEach(t),Xr=u(po),yi=o(po,"SPAN",{});var Is=r(yi);Yr=f(Is,"Semantic Guidance (SEGA)"),Is.forEach(t),po.forEach(t),ga=u(e),Dt=o(e,"P",{});var Ts=r(Dt);ze=o(Ts,"A",{href:!0,rel:!0});var Ns=r(ze);en=f(Ns,"Paper"),Ns.forEach(t),Ts.forEach(t),wa=u(e),qt=o(e,"P",{});var Ds=r(qt);tn=f(Ds,"SEGA allows applying or removing one or more concepts from an image. The strength of the concept can also be controlled. I.e. the smile concept can be used to incrementally increase or decrease the smile of a portrait."),Ds.forEach(t),_a=u(e),Gt=o(e,"P",{});var qs=r(Gt);an=f(qs,"Similar to how classifier free guidance provides guidance via empty prompt inputs, SEGA provides guidance on conceptual prompts. Multiple of these conceptual prompts can be applied simultaneously. Each conceptual prompt can either add or remove their concept depending on if the guidance is applied positively or negatively."),qs.forEach(t),xa=u(e),jt=o(e,"P",{});var Gs=r(jt);on=f(Gs,"Unlike Pix2Pix Zero or Attend and Excite, SEGA directly interacts with the diffusion process instead of performing any explicit gradient-based optimization."),Gs.forEach(t),ba=u(e),te=o(e,"P",{});var ho=r(te);rn=f(ho,"See "),zt=o(ho,"A",{href:!0});var js=r(zt);nn=f(js,"here"),js.forEach(t),ln=f(ho," for more information on how to use it."),ho.forEach(t),Ea=u(e),j=o(e,"H2",{class:!0});var uo=r(j);ie=o(uo,"A",{id:!0,class:!0,href:!0});var zs=r(ie);$i=o(zs,"SPAN",{});var Ls=r($i);_(Le.$$.fragment,Ls),Ls.forEach(t),zs.forEach(t),sn=u(uo),Ai=o(uo,"SPAN",{});var Zs=r(Ai);fn=f(Zs,"Self-attention Guidance (SAG)"),Zs.forEach(t),uo.forEach(t),Pa=u(e),Lt=o(e,"P",{});var Cs=r(Lt);Ze=o(Cs,"A",{href:!0,rel:!0});var Ms=r(Ze);pn=f(Ms,"Paper"),Ms.forEach(t),Cs.forEach(t),ya=u(e),Ce=o(e,"P",{});var cl=r(Ce);Zt=o(cl,"A",{href:!0});var Hs=r(Zt);hn=f(Hs,"Self-attention Guidance"),Hs.forEach(t),un=f(cl," improves the general quality of images."),cl.forEach(t),$a=u(e),Ct=o(e,"P",{});var Fs=r(Ct);dn=f(Fs,"SAG provides guidance from predictions not conditioned on high-frequency details to fully conditioned images. The high frequency details are extracted out of the UNet self-attention maps."),Fs.forEach(t),Aa=u(e),ae=o(e,"P",{});var co=r(ae);cn=f(co,"See "),Mt=o(co,"A",{href:!0});var Us=r(Mt);mn=f(Us,"here"),Us.forEach(t),vn=f(co," for more information on how to use it."),co.forEach(t),Sa=u(e),z=o(e,"H2",{class:!0});var mo=r(z);oe=o(mo,"A",{id:!0,class:!0,href:!0});var Ws=r(oe);Si=o(Ws,"SPAN",{});var Bs=r(Si);_(Me.$$.fragment,Bs),Bs.forEach(t),Ws.forEach(t),gn=u(mo),ki=o(mo,"SPAN",{});var Os=r(ki);wn=f(Os,"Depth2Image"),Os.forEach(t),mo.forEach(t),ka=u(e),Ht=o(e,"P",{});var Rs=r(Ht);He=o(Rs,"A",{href:!0,rel:!0});var Vs=r(He);_n=f(Vs,"Project"),Vs.forEach(t),Rs.forEach(t),Ia=u(e),Fe=o(e,"P",{});var ml=r(Fe);Ft=o(ml,"A",{href:!0});var Js=r(Ft);xn=f(Js,"Depth2Image"),Js.forEach(t),bn=f(ml," is fine-tuned from Stable Diffusion to better preserve semantics for text guided image variation."),ml.forEach(t),Ta=u(e),Ut=o(e,"P",{});var Ks=r(Ut);En=f(Ks,"It conditions on a monocular depth estimate of the original image."),Ks.forEach(t),Na=u(e),re=o(e,"P",{});var vo=r(re);Pn=f(vo,"See "),Wt=o(vo,"A",{href:!0});var Qs=r(Wt);yn=f(Qs,"here"),Qs.forEach(t),$n=f(vo," for more information on how to use it."),vo.forEach(t),Da=u(e),_(ne.$$.fragment,e),qa=u(e),L=o(e,"H2",{class:!0});var go=r(L);le=o(go,"A",{id:!0,class:!0,href:!0});var Xs=r(le);Ii=o(Xs,"SPAN",{});var Ys=r(Ii);_(Ue.$$.fragment,Ys),Ys.forEach(t),Xs.forEach(t),An=u(go),Ti=o(go,"SPAN",{});var ef=r(Ti);Sn=f(ef,"MultiDiffusion Panorama"),ef.forEach(t),go.forEach(t),Ga=u(e),Bt=o(e,"P",{});var tf=r(Bt);We=o(tf,"A",{href:!0,rel:!0});var af=r(We);kn=f(af,"Paper"),af.forEach(t),tf.forEach(t),ja=u(e),se=o(e,"P",{});var wo=r(se);In=f(wo,`MultiDiffusion defines a new generation process over a pre-trained diffusion model. This process binds together multiple diffusion generation methods that can be readily applied to generate high quality and diverse images. Results adhere to user-provided controls, such as desired aspect ratio (e.g., panorama), and spatial guiding signals, ranging from tight segmentation masks to bounding boxes. | |
| `),Ot=o(wo,"A",{href:!0});var of=r(Ot);Tn=f(of,"MultiDiffusion Panorama"),of.forEach(t),Nn=f(wo," allows to generate high-quality images at arbitrary aspect ratios (e.g., panoramas)."),wo.forEach(t),za=u(e),fe=o(e,"P",{});var _o=r(fe);Dn=f(_o,"See "),Rt=o(_o,"A",{href:!0});var rf=r(Rt);qn=f(rf,"here"),rf.forEach(t),Gn=f(_o," for more information on how to use it to generate panoramic images."),_o.forEach(t),La=u(e),Z=o(e,"H2",{class:!0});var xo=r(Z);pe=o(xo,"A",{id:!0,class:!0,href:!0});var nf=r(pe);Ni=o(nf,"SPAN",{});var lf=r(Ni);_(Be.$$.fragment,lf),lf.forEach(t),nf.forEach(t),jn=u(xo),Di=o(xo,"SPAN",{});var sf=r(Di);zn=f(sf,"Fine-tuning your own models"),sf.forEach(t),xo.forEach(t),Za=u(e),Vt=o(e,"P",{});var ff=r(Vt);Ln=f(ff,"In addition to pre-trained models, Diffusers has training scripts for fine-tuning models on user-provided data."),ff.forEach(t),Ca=u(e),C=o(e,"H3",{class:!0});var bo=r(C);he=o(bo,"A",{id:!0,class:!0,href:!0});var pf=r(he);qi=o(pf,"SPAN",{});var hf=r(qi);_(Oe.$$.fragment,hf),hf.forEach(t),pf.forEach(t),Zn=u(bo),Gi=o(bo,"SPAN",{});var uf=r(Gi);Cn=f(uf,"DreamBooth"),uf.forEach(t),bo.forEach(t),Ma=u(e),Re=o(e,"P",{});var vl=r(Re);Jt=o(vl,"A",{href:!0});var df=r(Jt);Mn=f(df,"DreamBooth"),df.forEach(t),Hn=f(vl," fine-tunes a model to teach it about a new subject. I.e. a few pictures of a person can be used to generate images of that person in different styles."),vl.forEach(t),Ha=u(e),ue=o(e,"P",{});var Eo=r(ue);Fn=f(Eo,"See "),Kt=o(Eo,"A",{href:!0});var cf=r(Kt);Un=f(cf,"here"),cf.forEach(t),Wn=f(Eo," for more information on how to use it."),Eo.forEach(t),Fa=u(e),M=o(e,"H3",{class:!0});var Po=r(M);de=o(Po,"A",{id:!0,class:!0,href:!0});var mf=r(de);ji=o(mf,"SPAN",{});var vf=r(ji);_(Ve.$$.fragment,vf),vf.forEach(t),mf.forEach(t),Bn=u(Po),zi=o(Po,"SPAN",{});var gf=r(zi);On=f(gf,"Textual Inversion"),gf.forEach(t),Po.forEach(t),Ua=u(e),Je=o(e,"P",{});var gl=r(Je);Qt=o(gl,"A",{href:!0});var wf=r(Qt);Rn=f(wf,"Textual Inversion"),wf.forEach(t),Vn=f(gl," fine-tunes a model to teach it about a new concept. I.e. a few pictures of a style of artwork can be used to generate images in that style."),gl.forEach(t),Wa=u(e),ce=o(e,"P",{});var yo=r(ce);Jn=f(yo,"See "),Xt=o(yo,"A",{href:!0});var _f=r(Xt);Kn=f(_f,"here"),_f.forEach(t),Qn=f(yo," for more information on how to use it."),yo.forEach(t),Ba=u(e),H=o(e,"H2",{class:!0});var $o=r(H);me=o($o,"A",{id:!0,class:!0,href:!0});var xf=r(me);Li=o(xf,"SPAN",{});var bf=r(Li);_(Ke.$$.fragment,bf),bf.forEach(t),xf.forEach(t),Xn=u($o),Zi=o($o,"SPAN",{});var Ef=r(Zi);Yn=f(Ef,"ControlNet"),Ef.forEach(t),$o.forEach(t),Oa=u(e),Yt=o(e,"P",{});var Pf=r(Yt);Qe=o(Pf,"A",{href:!0,rel:!0});var yf=r(Qe);el=f(yf,"Paper"),yf.forEach(t),Pf.forEach(t),Ra=u(e),Xe=o(e,"P",{});var wl=r(Xe);ei=o(wl,"A",{href:!0});var $f=r(ei);tl=f($f,"ControlNet"),$f.forEach(t),il=f(wl,` is an auxiliary network which adds an extra condition. | |
| There are 8 canonical pre-trained ControlNets trained on different conditionings such as edge detection, scribbles, | |
| depth maps, and semantic segmentations.`),wl.forEach(t),Va=u(e),ve=o(e,"P",{});var Ao=r(ve);al=f(Ao,"See "),ti=o(Ao,"A",{href:!0});var Af=r(ti);ol=f(Af,"here"),Af.forEach(t),rl=f(Ao," for more information on how to use it."),Ao.forEach(t),Ja=u(e),F=o(e,"H2",{class:!0});var So=r(F);ge=o(So,"A",{id:!0,class:!0,href:!0});var Sf=r(ge);Ci=o(Sf,"SPAN",{});var kf=r(Ci);_(Ye.$$.fragment,kf),kf.forEach(t),Sf.forEach(t),nl=u(So),Mi=o(So,"SPAN",{});var If=r(Mi);ll=f(If,"Prompt Weighting"),If.forEach(t),So.forEach(t),Ka=u(e),ii=o(e,"P",{});var Tf=r(ii);sl=f(Tf,`Prompt weighting is a simple technique that puts more attention weight on certain parts of the text | |
| input.`),Tf.forEach(t),Qa=u(e),we=o(e,"P",{});var ko=r(we);fl=f(ko,"For a more in-detail explanation and examples, see "),ai=o(ko,"A",{href:!0});var Nf=r(ai);pl=f(Nf,"here"),Nf.forEach(t),hl=f(ko,"."),ko.forEach(t),this.h()},h(){p(c,"name","hf:doc:metadata"),p(c,"content",JSON.stringify(Hf)),p(g,"id","controlled-generation"),p(g,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(g,"href","#controlled-generation"),p(m,"class","relative group"),p(xe,"href","https://discuss.huggingface.co/"),p(xe,"rel","nofollow"),p(be,"href","https://github.com/huggingface/diffusers/issues"),p(be,"rel","nofollow"),p(st,"href","#instruct-pix2pix"),p(ft,"href","#pix2pixzero"),p(pt,"href","#attend-and-excite"),p(ht,"href","#semantic-guidance"),p(ut,"href","#self-attention-guidance"),p(dt,"href","#depth2image"),p(ct,"href","#multidiffusion-panorama"),p(mt,"href","#dreambooth"),p(vt,"href","#textual-inversion"),p(gt,"href","#controlnet"),p(wt,"href","#prompt-weighting"),p(W,"id","instruct-pix2pix"),p(W,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(W,"href","#instruct-pix2pix"),p(T,"class","relative group"),p(Pe,"href","https://arxiv.org/abs/2211.09800"),p(Pe,"rel","nofollow"),p(xt,"href","../api/pipelines/stable_diffusion/pix2pix"),p(ye,"href","https://openai.com/blog/instruction-following/"),p(ye,"rel","nofollow"),p(bt,"href","../api/pipelines/stable_diffusion/pix2pix"),p(O,"id","pix2pix-zero"),p(O,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(O,"href","#pix2pix-zero"),p(D,"class","relative group"),p(Ae,"href","https://arxiv.org/abs/2302.03027"),p(Ae,"rel","nofollow"),p(Pt,"href","../api/pipelines/stable_diffusion/pix2pix_zero"),p(Ie,"href","https://huggingface.co/docs/transformers/model_doc/flan-t5"),p(Ie,"rel","nofollow"),p(Ne,"href","https://huggingface.co/docs/transformers/model_doc/blip"),p(Ne,"rel","nofollow"),p(At,"href","../api/pipelines/stable_diffusion/text2img"),p(St,"href","../api/pipelines/stable_diffusion/pix2pix_zero"),p(Q,"id","attend-and-excite"),p(Q,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(Q,"href","#attend-and-excite"),p(q,"class","relative group"),p(qe,"href","https://arxiv.org/abs/2301.13826"),p(qe,"rel","nofollow"),p(It,"href","../api/pipelines/stable_diffusion/attend_and_excite"),p(Nt,"href","../api/pipelines/stable_diffusion/attend_and_excite"),p(ee,"id","semantic-guidance-sega"),p(ee,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(ee,"href","#semantic-guidance-sega"),p(G,"class","relative group"),p(ze,"href","https://arxiv.org/abs/2301.12247"),p(ze,"rel","nofollow"),p(zt,"href","../api/pipelines/semantic_stable_diffusion"),p(ie,"id","selfattention-guidance-sag"),p(ie,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(ie,"href","#selfattention-guidance-sag"),p(j,"class","relative group"),p(Ze,"href","https://arxiv.org/abs/2210.00939"),p(Ze,"rel","nofollow"),p(Zt,"href","../api/pipelines/stable_diffusion/self_attention_guidance"),p(Mt,"href","../api/pipelines/stable_diffusion/self_attention_guidance"),p(oe,"id","depth2image"),p(oe,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(oe,"href","#depth2image"),p(z,"class","relative group"),p(He,"href","https://huggingface.co/stabilityai/stable-diffusion-2-depth"),p(He,"rel","nofollow"),p(Ft,"href","../pipelines/stable_diffusion_2#depthtoimage"),p(Wt,"href","../api/pipelines/stable_diffusion_2#depthtoimage"),p(le,"id","multidiffusion-panorama"),p(le,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(le,"href","#multidiffusion-panorama"),p(L,"class","relative group"),p(We,"href","https://arxiv.org/abs/2302.08113"),p(We,"rel","nofollow"),p(Ot,"href","../api/pipelines/stable_diffusion/panorama"),p(Rt,"href","../api/pipelines/stable_diffusion/panorama"),p(pe,"id","finetuning-your-own-models"),p(pe,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(pe,"href","#finetuning-your-own-models"),p(Z,"class","relative group"),p(he,"id","dreambooth"),p(he,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(he,"href","#dreambooth"),p(C,"class","relative group"),p(Jt,"href","../training/dreambooth"),p(Kt,"href","../training/dreambooth"),p(de,"id","textual-inversion"),p(de,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(de,"href","#textual-inversion"),p(M,"class","relative group"),p(Qt,"href","../training/text_inversion"),p(Xt,"href","../training/text_inversion"),p(me,"id","controlnet"),p(me,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(me,"href","#controlnet"),p(H,"class","relative group"),p(Qe,"href","https://arxiv.org/abs/2302.05543"),p(Qe,"rel","nofollow"),p(ei,"href","../api/pipelines/stable_diffusion/controlnet"),p(ti,"href","../api/pipelines/stable_diffusion/controlnet"),p(ge,"id","prompt-weighting"),p(ge,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(ge,"href","#prompt-weighting"),p(F,"class","relative group"),p(ai,"href","../using-diffusers/weighted_prompts")},m(e,n){i(document.head,c),l(e,A,n),l(e,m,n),i(m,g),i(g,I),x(y,I,null),i(m,k),i(m,U),i(U,Io),l(e,Ui,n),l(e,it,n),i(it,To),l(e,Wi,n),l(e,at,n),i(at,No),l(e,Bi,n),l(e,ot,n),i(ot,Do),l(e,Oi,n),l(e,S,n),i(S,qo),i(S,oi),i(oi,Go),i(S,jo),i(S,xe),i(xe,zo),i(S,Lo),i(S,be),i(be,Zo),i(S,Co),l(e,Ri,n),l(e,rt,n),i(rt,Mo),l(e,Vi,n),l(e,nt,n),i(nt,Ho),l(e,Ji,n),l(e,lt,n),i(lt,Fo),l(e,Ki,n),l(e,d,n),i(d,ri),i(ri,st),i(st,Uo),i(d,Wo),i(d,ni),i(ni,ft),i(ft,Bo),i(d,Oo),i(d,li),i(li,pt),i(pt,Ro),i(d,Vo),i(d,si),i(si,ht),i(ht,Jo),i(d,Ko),i(d,fi),i(fi,ut),i(ut,Qo),i(d,Xo),i(d,pi),i(pi,dt),i(dt,Yo),i(d,er),i(d,hi),i(hi,ct),i(ct,tr),i(d,ir),i(d,ui),i(ui,mt),i(mt,ar),i(d,or),i(d,di),i(di,vt),i(vt,rr),i(d,nr),i(d,ci),i(ci,gt),i(gt,lr),i(d,sr),i(d,mi),i(mi,wt),i(wt,fr),l(e,Qi,n),l(e,T,n),i(T,W),i(W,vi),x(Ee,vi,null),i(T,pr),i(T,gi),i(gi,hr),l(e,Xi,n),l(e,_t,n),i(_t,Pe),i(Pe,ur),l(e,Yi,n),l(e,N,n),i(N,xt),i(xt,dr),i(N,cr),i(N,ye),i(ye,mr),i(N,vr),l(e,ea,n),l(e,B,n),i(B,gr),i(B,bt),i(bt,wr),i(B,_r),l(e,ta,n),l(e,D,n),i(D,O),i(O,wi),x($e,wi,null),i(D,xr),i(D,_i),i(_i,br),l(e,ia,n),l(e,Et,n),i(Et,Ae),i(Ae,Er),l(e,aa,n),l(e,Se,n),i(Se,Pt),i(Pt,Pr),i(Se,yr),l(e,oa,n),l(e,yt,n),i(yt,$r),l(e,ra,n),l(e,$t,n),i($t,Ar),l(e,na,n),l(e,R,n),i(R,ke),i(ke,Sr),i(ke,Ie),i(Ie,kr),i(ke,Ir),i(R,Tr),i(R,Te),i(Te,Nr),i(Te,Ne),i(Ne,Dr),i(Te,qr),l(e,la,n),x(V,e,n),l(e,sa,n),l(e,J,n),i(J,Gr),i(J,At),i(At,jr),i(J,zr),l(e,fa,n),l(e,K,n),i(K,Lr),i(K,St),i(St,Zr),i(K,Cr),l(e,pa,n),l(e,q,n),i(q,Q),i(Q,xi),x(De,xi,null),i(q,Mr),i(q,bi),i(bi,Hr),l(e,ha,n),l(e,kt,n),i(kt,qe),i(qe,Fr),l(e,ua,n),l(e,Ge,n),i(Ge,It),i(It,Ur),i(Ge,Wr),l(e,da,n),l(e,Tt,n),i(Tt,Br),l(e,ca,n),l(e,X,n),i(X,Or),i(X,Ei),i(Ei,Rr),i(X,Vr),l(e,ma,n),l(e,Y,n),i(Y,Jr),i(Y,Nt),i(Nt,Kr),i(Y,Qr),l(e,va,n),l(e,G,n),i(G,ee),i(ee,Pi),x(je,Pi,null),i(G,Xr),i(G,yi),i(yi,Yr),l(e,ga,n),l(e,Dt,n),i(Dt,ze),i(ze,en),l(e,wa,n),l(e,qt,n),i(qt,tn),l(e,_a,n),l(e,Gt,n),i(Gt,an),l(e,xa,n),l(e,jt,n),i(jt,on),l(e,ba,n),l(e,te,n),i(te,rn),i(te,zt),i(zt,nn),i(te,ln),l(e,Ea,n),l(e,j,n),i(j,ie),i(ie,$i),x(Le,$i,null),i(j,sn),i(j,Ai),i(Ai,fn),l(e,Pa,n),l(e,Lt,n),i(Lt,Ze),i(Ze,pn),l(e,ya,n),l(e,Ce,n),i(Ce,Zt),i(Zt,hn),i(Ce,un),l(e,$a,n),l(e,Ct,n),i(Ct,dn),l(e,Aa,n),l(e,ae,n),i(ae,cn),i(ae,Mt),i(Mt,mn),i(ae,vn),l(e,Sa,n),l(e,z,n),i(z,oe),i(oe,Si),x(Me,Si,null),i(z,gn),i(z,ki),i(ki,wn),l(e,ka,n),l(e,Ht,n),i(Ht,He),i(He,_n),l(e,Ia,n),l(e,Fe,n),i(Fe,Ft),i(Ft,xn),i(Fe,bn),l(e,Ta,n),l(e,Ut,n),i(Ut,En),l(e,Na,n),l(e,re,n),i(re,Pn),i(re,Wt),i(Wt,yn),i(re,$n),l(e,Da,n),x(ne,e,n),l(e,qa,n),l(e,L,n),i(L,le),i(le,Ii),x(Ue,Ii,null),i(L,An),i(L,Ti),i(Ti,Sn),l(e,Ga,n),l(e,Bt,n),i(Bt,We),i(We,kn),l(e,ja,n),l(e,se,n),i(se,In),i(se,Ot),i(Ot,Tn),i(se,Nn),l(e,za,n),l(e,fe,n),i(fe,Dn),i(fe,Rt),i(Rt,qn),i(fe,Gn),l(e,La,n),l(e,Z,n),i(Z,pe),i(pe,Ni),x(Be,Ni,null),i(Z,jn),i(Z,Di),i(Di,zn),l(e,Za,n),l(e,Vt,n),i(Vt,Ln),l(e,Ca,n),l(e,C,n),i(C,he),i(he,qi),x(Oe,qi,null),i(C,Zn),i(C,Gi),i(Gi,Cn),l(e,Ma,n),l(e,Re,n),i(Re,Jt),i(Jt,Mn),i(Re,Hn),l(e,Ha,n),l(e,ue,n),i(ue,Fn),i(ue,Kt),i(Kt,Un),i(ue,Wn),l(e,Fa,n),l(e,M,n),i(M,de),i(de,ji),x(Ve,ji,null),i(M,Bn),i(M,zi),i(zi,On),l(e,Ua,n),l(e,Je,n),i(Je,Qt),i(Qt,Rn),i(Je,Vn),l(e,Wa,n),l(e,ce,n),i(ce,Jn),i(ce,Xt),i(Xt,Kn),i(ce,Qn),l(e,Ba,n),l(e,H,n),i(H,me),i(me,Li),x(Ke,Li,null),i(H,Xn),i(H,Zi),i(Zi,Yn),l(e,Oa,n),l(e,Yt,n),i(Yt,Qe),i(Qe,el),l(e,Ra,n),l(e,Xe,n),i(Xe,ei),i(ei,tl),i(Xe,il),l(e,Va,n),l(e,ve,n),i(ve,al),i(ve,ti),i(ti,ol),i(ve,rl),l(e,Ja,n),l(e,F,n),i(F,ge),i(ge,Ci),x(Ye,Ci,null),i(F,nl),i(F,Mi),i(Mi,ll),l(e,Ka,n),l(e,ii,n),i(ii,sl),l(e,Qa,n),l(e,we,n),i(we,fl),i(we,ai),i(ai,pl),i(we,hl),Xa=!0},p(e,[n]){const et={};n&2&&(et.$$scope={dirty:n,ctx:e}),V.$set(et);const Hi={};n&2&&(Hi.$$scope={dirty:n,ctx:e}),ne.$set(Hi)},i(e){Xa||(b(y.$$.fragment,e),b(Ee.$$.fragment,e),b($e.$$.fragment,e),b(V.$$.fragment,e),b(De.$$.fragment,e),b(je.$$.fragment,e),b(Le.$$.fragment,e),b(Me.$$.fragment,e),b(ne.$$.fragment,e),b(Ue.$$.fragment,e),b(Be.$$.fragment,e),b(Oe.$$.fragment,e),b(Ve.$$.fragment,e),b(Ke.$$.fragment,e),b(Ye.$$.fragment,e),Xa=!0)},o(e){E(y.$$.fragment,e),E(Ee.$$.fragment,e),E($e.$$.fragment,e),E(V.$$.fragment,e),E(De.$$.fragment,e),E(je.$$.fragment,e),E(Le.$$.fragment,e),E(Me.$$.fragment,e),E(ne.$$.fragment,e),E(Ue.$$.fragment,e),E(Be.$$.fragment,e),E(Oe.$$.fragment,e),E(Ve.$$.fragment,e),E(Ke.$$.fragment,e),E(Ye.$$.fragment,e),Xa=!1},d(e){t(c),e&&t(A),e&&t(m),P(y),e&&t(Ui),e&&t(it),e&&t(Wi),e&&t(at),e&&t(Bi),e&&t(ot),e&&t(Oi),e&&t(S),e&&t(Ri),e&&t(rt),e&&t(Vi),e&&t(nt),e&&t(Ji),e&&t(lt),e&&t(Ki),e&&t(d),e&&t(Qi),e&&t(T),P(Ee),e&&t(Xi),e&&t(_t),e&&t(Yi),e&&t(N),e&&t(ea),e&&t(B),e&&t(ta),e&&t(D),P($e),e&&t(ia),e&&t(Et),e&&t(aa),e&&t(Se),e&&t(oa),e&&t(yt),e&&t(ra),e&&t($t),e&&t(na),e&&t(R),e&&t(la),P(V,e),e&&t(sa),e&&t(J),e&&t(fa),e&&t(K),e&&t(pa),e&&t(q),P(De),e&&t(ha),e&&t(kt),e&&t(ua),e&&t(Ge),e&&t(da),e&&t(Tt),e&&t(ca),e&&t(X),e&&t(ma),e&&t(Y),e&&t(va),e&&t(G),P(je),e&&t(ga),e&&t(Dt),e&&t(wa),e&&t(qt),e&&t(_a),e&&t(Gt),e&&t(xa),e&&t(jt),e&&t(ba),e&&t(te),e&&t(Ea),e&&t(j),P(Le),e&&t(Pa),e&&t(Lt),e&&t(ya),e&&t(Ce),e&&t($a),e&&t(Ct),e&&t(Aa),e&&t(ae),e&&t(Sa),e&&t(z),P(Me),e&&t(ka),e&&t(Ht),e&&t(Ia),e&&t(Fe),e&&t(Ta),e&&t(Ut),e&&t(Na),e&&t(re),e&&t(Da),P(ne,e),e&&t(qa),e&&t(L),P(Ue),e&&t(Ga),e&&t(Bt),e&&t(ja),e&&t(se),e&&t(za),e&&t(fe),e&&t(La),e&&t(Z),P(Be),e&&t(Za),e&&t(Vt),e&&t(Ca),e&&t(C),P(Oe),e&&t(Ma),e&&t(Re),e&&t(Ha),e&&t(ue),e&&t(Fa),e&&t(M),P(Ve),e&&t(Ua),e&&t(Je),e&&t(Wa),e&&t(ce),e&&t(Ba),e&&t(H),P(Ke),e&&t(Oa),e&&t(Yt),e&&t(Ra),e&&t(Xe),e&&t(Va),e&&t(ve),e&&t(Ja),e&&t(F),P(Ye),e&&t(Ka),e&&t(ii),e&&t(Qa),e&&t(we)}}}const Hf={local:"controlled-generation",sections:[{local:"instruct-pix2pix",title:"Instruct Pix2Pix"},{local:"pix2pix-zero",title:"Pix2Pix Zero"},{local:"attend-and-excite",title:"Attend and Excite"},{local:"semantic-guidance-sega",title:"Semantic Guidance (SEGA)"},{local:"selfattention-guidance-sag",title:"Self-attention Guidance (SAG)"},{local:"depth2image",title:"Depth2Image"},{local:"multidiffusion-panorama",title:"MultiDiffusion Panorama"},{local:"finetuning-your-own-models",sections:[{local:"dreambooth",title:"DreamBooth"},{local:"textual-inversion",title:"Textual Inversion"}],title:"Fine-tuning your own models"},{local:"controlnet",title:"ControlNet"},{local:"prompt-weighting",title:"Prompt Weighting"}],title:"Controlled generation"};function Ff(tt){return Lf(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Of extends qf{constructor(c){super();Gf(this,c,Ff,Mf,jf,{})}}export{Of as default,Hf as metadata}; | |
Xet Storage Details
- Size:
- 45.6 kB
- Xet hash:
- 489ecba7bfb7d481513ae96c205bec806f87c5e0437fd73cf2059ccc389777f1
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.