Buckets:

hf-doc-build
/

doc

hf-doc-build/doc / diffusers /v0.13.0 /en /_app /pages /using-diffusers /controlling_generation.mdx-hf-doc-builder.js

rtrm's picture

about 2 months ago

34.4 kB

	import{S as as,i as rs,s as ns,e as o,k as h,w as x,t as l,M as ss,c as a,d as t,m as u,a as r,x as E,h as f,b as p,G as i,g as s,y,q as P,o as $,B as A,v as ls}from"../../chunks/vendor-hf-doc-builder.js";import{T as fs}from"../../chunks/Tip-hf-doc-builder.js";import{I as S}from"../../chunks/IconCopyLink-hf-doc-builder.js";function ps(Vt){let d,L,m,w;return{c(){d=o("p"),L=l(`Pix2PixZero is the first model that allows \u201C0-shot\u201D image editing. This means that the model
	can edit an image in less than a minute on a consumer GPU as shown `),m=o("a"),w=l("here"),this.h()},l(b){d=a(b,"P",{});var g=r(d);L=f(g,`Pix2PixZero is the first model that allows \u201C0-shot\u201D image editing. This means that the model
	can edit an image in less than a minute on a consumer GPU as shown `),m=a(g,"A",{href:!0});var je=r(m);w=f(je,"here"),je.forEach(t),g.forEach(t),this.h()},h(){p(m,"href","../api/pipelines/stable_diffusion/pix2pix_zero#usage-example")},m(b,g){s(b,d,g),i(d,L),i(d,m),i(m,w)},d(b){b&&t(d)}}}function hs(Vt){let d,L,m,w,b,g,je,yt,co,Xt,De,mo,Yt,ze,vo,ei,Ge,go,ti,_,wo,Pt,_o,bo,ae,xo,Eo,re,yo,Po,ii,Le,$o,oi,He,Ao,ai,Fe,So,ri,c,$t,Be,ko,Io,At,Ce,To,qo,St,Me,No,jo,kt,Ue,Do,zo,It,Ze,Go,Lo,Tt,Oe,Ho,Fo,qt,We,Bo,Co,Nt,Je,Mo,ni,k,H,jt,ne,Uo,Dt,Zo,si,Re,se,Oo,li,le,Ke,Wo,Jo,fi,F,Ro,Qe,Ko,Qo,pi,I,B,zt,fe,Vo,Gt,Xo,hi,Ve,pe,Yo,ui,he,Xe,ea,ta,ci,Ye,ia,di,et,oa,mi,C,ue,aa,ce,ra,na,sa,de,la,me,fa,pa,vi,M,gi,U,ha,tt,ua,ca,wi,T,Z,Lt,ve,da,Ht,ma,_i,it,ge,va,bi,we,ot,ga,wa,xi,at,_a,Ei,O,ba,rt,xa,Ea,yi,q,W,Ft,_e,ya,Bt,Pa,Pi,nt,be,$a,$i,st,Aa,Ai,lt,Sa,Si,J,ka,ft,Ia,Ta,ki,N,R,Ct,xe,qa,Mt,Na,Ii,pt,Ee,ja,Ti,ye,ht,Da,za,qi,ut,Ga,Ni,K,La,ct,Ha,Fa,ji,j,Q,Ut,Pe,Ba,Zt,Ca,Di,dt,$e,Ma,zi,Ae,mt,Ua,Za,Gi,vt,Oa,Li,V,Wa,gt,Ja,Ra,Hi,D,X,Ot,Se,Ka,Wt,Qa,Fi,wt,Va,Bi,z,Y,Jt,ke,Xa,Rt,Ya,Ci,Ie,_t,er,tr,Mi,ee,ir,bt,or,ar,Ui,G,te,Kt,Te,rr,Qt,nr,Zi,qe,xt,sr,lr,Oi,ie,fr,Et,pr,hr,Wi;return g=new S({}),ne=new S({}),fe=new S({}),M=new fs({props:{$$slots:{default:[ps]},$$scope:{ctx:Vt}}}),ve=new S({}),_e=new S({}),xe=new S({}),Pe=new S({}),Se=new S({}),ke=new S({}),Te=new S({}),{c(){d=o("meta"),L=h(),m=o("h1"),w=o("a"),b=o("span"),x(g.$$.fragment),je=h(),yt=o("span"),co=l("Controlling generation of diffusion models"),Xt=h(),De=o("p"),mo=l("Controlling outputs generated by diffusion models has been long pursued by the community and is now an active research topic. In many popular diffusion models, subtle changes in inputs, both images and text prompts, can drastically change outputs. In an ideal world we want to be able to control how semantics are preserved and changed."),Yt=h(),ze=o("p"),vo=l("Most examples of preserving semantics reduce to being able to accurately map a change in input to a change in output. I.e. adding an adjective to a subject in a prompt preserves the entire image, only modifying the changed subject. Or, image variation of a particular subject preserves the subject\u2019s pose."),ei=h(),Ge=o("p"),go=l("Additionally, there are qualities of generated images that we would like to influence beyond semantic preservation. I.e. in general, we would like our outputs to be of good quality, adhere to a particular style, or be realistic."),ti=h(),_=o("p"),wo=l("We will document some of the techniques "),Pt=o("code"),_o=l("diffusers"),bo=l(" supports to control generation of diffusion models. Much is cutting edge research and can be quite nuanced. If something needs clarifying or you have a suggestion, don\u2019t hesitate to open a discussion on the "),ae=o("a"),xo=l("forum"),Eo=l(" or a "),re=o("a"),yo=l("GitHub issue"),Po=l("."),ii=h(),Le=o("p"),$o=l("We provide a high level explanation of how the generation can be controlled as well as a snippet of the technicals. For more in depth explanations on the technicals, the original papers which are linked from the pipelines are always the best resources."),oi=h(),He=o("p"),Ao=l("Depending on the use case, one should choose a technique accordingly. In many cases, these techniques can be combined. For example, one can combine Textual Inversion with SEGA to provide more semantic guidance to the outputs generated using Textual Inversion."),ai=h(),Fe=o("p"),So=l("Unless otherwise mentioned, these are techniques that work with existing models and don\u2019t require their own weights."),ri=h(),c=o("ol"),$t=o("li"),Be=o("a"),ko=l("Instruct Pix2Pix"),Io=h(),At=o("li"),Ce=o("a"),To=l("Pix2Pix 0"),qo=h(),St=o("li"),Me=o("a"),No=l("Attend and excite"),jo=h(),kt=o("li"),Ue=o("a"),Do=l("Semantic guidance"),zo=h(),It=o("li"),Ze=o("a"),Go=l("Self attention guidance"),Lo=h(),Tt=o("li"),Oe=o("a"),Ho=l("Depth2image"),Fo=h(),qt=o("li"),We=o("a"),Bo=l("DreamBooth"),Co=h(),Nt=o("li"),Je=o("a"),Mo=l("Textual Inversion"),ni=h(),k=o("h2"),H=o("a"),jt=o("span"),x(ne.$$.fragment),Uo=h(),Dt=o("span"),Zo=l("Instruct pix2pix"),si=h(),Re=o("p"),se=o("a"),Oo=l("Paper"),li=h(),le=o("p"),Ke=o("a"),Wo=l("Pix2Pix"),Jo=l(` is fine-tuned from stable diffusion to support editing input images. It takes as input an image with a prompt describing an edit, and it outputs the edited image.
	Pix2Pix has been trained to work explicitely well with instructGPT-like prompts.`),fi=h(),F=o("p"),Ro=l("See "),Qe=o("a"),Ko=l("here"),Qo=l(" for more information on how to use it."),pi=h(),I=o("h2"),B=o("a"),zt=o("span"),x(fe.$$.fragment),Vo=h(),Gt=o("span"),Xo=l("Pix2PixZero"),hi=h(),Ve=o("p"),pe=o("a"),Yo=l("Paper"),ui=h(),he=o("p"),Xe=o("a"),ea=l("Pix2Pix-zero"),ta=l(" allows modifying an image from one concept to another while preserving general image semantics."),ci=h(),Ye=o("p"),ia=l("The denoising process is guided from one conceptual embedding towards another conceptual embedding. The intermediate latents are optimized during the denoising process to push the attention maps towards reference attention maps. The reference attention maps are from the denoising process of the input image and are used to encourage semantic preservation."),di=h(),et=o("p"),oa=l("Pix2PixZero can be used both to edit synthetic images as well as real images."),mi=h(),C=o("ul"),ue=o("li"),aa=l(`To edit synthetic images, one first generates on image given a caption.
	Next, for a concept of the caption that shall be edited as well as the new target concept one generates image captions (e.g. with a model like `),ce=o("a"),ra=l("Flan-T5"),na=l("). Then, \u201Cmean\u201D prompt embeddings for both the source and target concepts are created via the text encoder. Finally, the pix2pix-zero algorithm is used to edit the synthetic image."),sa=h(),de=o("li"),la=l("To edit a real image, one first generates an image caption using a model like "),me=o("a"),fa=l("Blip"),pa=l(". Then one applies ddim inversion on the prompt and image to generate \u201Cinverse\u201D latents. Similar to before, \u201Cmean\u201D prompt embeddings for both source and target concepts are created and finally the pix2pix-zero algorithm in combination with the \u201Cinverse\u201D latents is used to edit the image."),vi=h(),x(M.$$.fragment),gi=h(),U=o("p"),ha=l("See "),tt=o("a"),ua=l("here"),ca=l(" for more information on how to use it."),wi=h(),T=o("h2"),Z=o("a"),Lt=o("span"),x(ve.$$.fragment),da=h(),Ht=o("span"),ma=l("Attend and excite"),_i=h(),it=o("p"),ge=o("a"),va=l("Paper"),bi=h(),we=o("p"),ot=o("a"),ga=l("Attend and excite"),wa=l(" allows subjects in the prompt to be faithfully represented in the final image."),xi=h(),at=o("p"),_a=l("A set of token indices are given as input, corresponding to the subjects in the prompt that need to be present in the image. During denoising, each token index is insured to have above a minimum attention threshold for at least one patch of the image. The intermediate latents are iteratively optimized during the denoising process to strengthen the attention of the most neglected subject token until the attention threshold is passed for all subject tokens."),Ei=h(),O=o("p"),ba=l("See "),rt=o("a"),xa=l("here"),Ea=l(" for more information on how to use it."),yi=h(),q=o("h2"),W=o("a"),Ft=o("span"),x(_e.$$.fragment),ya=h(),Bt=o("span"),Pa=l("Semantic guidance"),Pi=h(),nt=o("p"),be=o("a"),$a=l("Paper"),$i=h(),st=o("p"),Aa=l("SEGA allows applying or removing one or more concepts from an image. The strength of the concept can also be controlled. I.e. the smile concept can be used to incrementally increase or decrease the smile of a portrait."),Ai=h(),lt=o("p"),Sa=l("Similar to how classifier free guidance provides guidance via empty prompt inputs, SEGA provides guidance on conceptual prompts. Multiple of these conceptual prompts can be applied simultaneously. Each conceptual prompt can either add or remove their concept depending on if the guidance is applied positively or negatively."),Si=h(),J=o("p"),ka=l("See "),ft=o("a"),Ia=l("here"),Ta=l(" for more information on how to use it."),ki=h(),N=o("h2"),R=o("a"),Ct=o("span"),x(xe.$$.fragment),qa=h(),Mt=o("span"),Na=l("Self attention guidance"),Ii=h(),pt=o("p"),Ee=o("a"),ja=l("Paper"),Ti=h(),ye=o("p"),ht=o("a"),Da=l("Self attention guidance"),za=l(" improves the general quality of images."),qi=h(),ut=o("p"),Ga=l("SAG provides guidance from predictions not conditioned on high frequency details to fully conditioned images. The high frequency details are extracted out of the UNet self-attention maps."),Ni=h(),K=o("p"),La=l("See "),ct=o("a"),Ha=l("here"),Fa=l(" for more information on how to use it."),ji=h(),j=o("h2"),Q=o("a"),Ut=o("span"),x(Pe.$$.fragment),Ba=h(),Zt=o("span"),Ca=l("Depth2image"),Di=h(),dt=o("p"),$e=o("a"),Ma=l("Paper"),zi=h(),Ae=o("p"),mt=o("a"),Ua=l("Depth2image"),Za=l(" is fine-tuned from stable diffusion to better preserve semantics for text guided image variation."),Gi=h(),vt=o("p"),Oa=l("It conditions on a monocular depth estimate of the original image."),Li=h(),V=o("p"),Wa=l("See "),gt=o("a"),Ja=l("here"),Ra=l(" for more information on how to use it."),Hi=h(),D=o("h3"),X=o("a"),Ot=o("span"),x(Se.$$.fragment),Ka=h(),Wt=o("span"),Qa=l("Fine-tuning methods"),Fi=h(),wt=o("p"),Va=l("In addition to pre-trained models, diffusers has training scripts for fine-tuning models on user provided data."),Bi=h(),z=o("h2"),Y=o("a"),Jt=o("span"),x(ke.$$.fragment),Xa=h(),Rt=o("span"),Ya=l("DreamBooth"),Ci=h(),Ie=o("p"),_t=o("a"),er=l("DreamBooth"),tr=l(" fine-tunes a model to teach it about a new subject. I.e. a few pictures of a person can be used to generate images of that person in different styles."),Mi=h(),ee=o("p"),ir=l("See "),bt=o("a"),or=l("here"),ar=l(" for more information on how to use it."),Ui=h(),G=o("h2"),te=o("a"),Kt=o("span"),x(Te.$$.fragment),rr=h(),Qt=o("span"),nr=l("Textual Inversion"),Zi=h(),qe=o("p"),xt=o("a"),sr=l("Textual Inversion"),lr=l(" fine-tunes a model to teach it about a new concept. I.e. a few pictures of a style of artwork can be used to generate images in that style."),Oi=h(),ie=o("p"),fr=l("See "),Et=o("a"),pr=l("here"),hr=l(" for more information on how to use it."),this.h()},l(e){const n=ss('[data-svelte="svelte-1phssyn"]',document.head);d=a(n,"META",{name:!0,content:!0}),n.forEach(t),L=u(e),m=a(e,"H1",{class:!0});var Ne=r(m);w=a(Ne,"A",{id:!0,class:!0,href:!0});var _r=r(w);b=a(_r,"SPAN",{});var br=r(b);E(g.$$.fragment,br),br.forEach(t),_r.forEach(t),je=u(Ne),yt=a(Ne,"SPAN",{});var xr=r(yt);co=f(xr,"Controlling generation of diffusion models"),xr.forEach(t),Ne.forEach(t),Xt=u(e),De=a(e,"P",{});var Er=r(De);mo=f(Er,"Controlling outputs generated by diffusion models has been long pursued by the community and is now an active research topic. In many popular diffusion models, subtle changes in inputs, both images and text prompts, can drastically change outputs. In an ideal world we want to be able to control how semantics are preserved and changed."),Er.forEach(t),Yt=u(e),ze=a(e,"P",{});var yr=r(ze);vo=f(yr,"Most examples of preserving semantics reduce to being able to accurately map a change in input to a change in output. I.e. adding an adjective to a subject in a prompt preserves the entire image, only modifying the changed subject. Or, image variation of a particular subject preserves the subject\u2019s pose."),yr.forEach(t),ei=u(e),Ge=a(e,"P",{});var Pr=r(Ge);go=f(Pr,"Additionally, there are qualities of generated images that we would like to influence beyond semantic preservation. I.e. in general, we would like our outputs to be of good quality, adhere to a particular style, or be realistic."),Pr.forEach(t),ti=u(e),_=a(e,"P",{});var oe=r(_);wo=f(oe,"We will document some of the techniques "),Pt=a(oe,"CODE",{});var $r=r(Pt);_o=f($r,"diffusers"),$r.forEach(t),bo=f(oe," supports to control generation of diffusion models. Much is cutting edge research and can be quite nuanced. If something needs clarifying or you have a suggestion, don\u2019t hesitate to open a discussion on the "),ae=a(oe,"A",{href:!0,rel:!0});var Ar=r(ae);xo=f(Ar,"forum"),Ar.forEach(t),Eo=f(oe," or a "),re=a(oe,"A",{href:!0,rel:!0});var Sr=r(re);yo=f(Sr,"GitHub issue"),Sr.forEach(t),Po=f(oe,"."),oe.forEach(t),ii=u(e),Le=a(e,"P",{});var kr=r(Le);$o=f(kr,"We provide a high level explanation of how the generation can be controlled as well as a snippet of the technicals. For more in depth explanations on the technicals, the original papers which are linked from the pipelines are always the best resources."),kr.forEach(t),oi=u(e),He=a(e,"P",{});var Ir=r(He);Ao=f(Ir,"Depending on the use case, one should choose a technique accordingly. In many cases, these techniques can be combined. For example, one can combine Textual Inversion with SEGA to provide more semantic guidance to the outputs generated using Textual Inversion."),Ir.forEach(t),ai=u(e),Fe=a(e,"P",{});var Tr=r(Fe);So=f(Tr,"Unless otherwise mentioned, these are techniques that work with existing models and don\u2019t require their own weights."),Tr.forEach(t),ri=u(e),c=a(e,"OL",{});var v=r(c);$t=a(v,"LI",{});var qr=r($t);Be=a(qr,"A",{href:!0});var Nr=r(Be);ko=f(Nr,"Instruct Pix2Pix"),Nr.forEach(t),qr.forEach(t),Io=u(v),At=a(v,"LI",{});var jr=r(At);Ce=a(jr,"A",{href:!0});var Dr=r(Ce);To=f(Dr,"Pix2Pix 0"),Dr.forEach(t),jr.forEach(t),qo=u(v),St=a(v,"LI",{});var zr=r(St);Me=a(zr,"A",{href:!0});var Gr=r(Me);No=f(Gr,"Attend and excite"),Gr.forEach(t),zr.forEach(t),jo=u(v),kt=a(v,"LI",{});var Lr=r(kt);Ue=a(Lr,"A",{href:!0});var Hr=r(Ue);Do=f(Hr,"Semantic guidance"),Hr.forEach(t),Lr.forEach(t),zo=u(v),It=a(v,"LI",{});var Fr=r(It);Ze=a(Fr,"A",{href:!0});var Br=r(Ze);Go=f(Br,"Self attention guidance"),Br.forEach(t),Fr.forEach(t),Lo=u(v),Tt=a(v,"LI",{});var Cr=r(Tt);Oe=a(Cr,"A",{href:!0});var Mr=r(Oe);Ho=f(Mr,"Depth2image"),Mr.forEach(t),Cr.forEach(t),Fo=u(v),qt=a(v,"LI",{});var Ur=r(qt);We=a(Ur,"A",{href:!0});var Zr=r(We);Bo=f(Zr,"DreamBooth"),Zr.forEach(t),Ur.forEach(t),Co=u(v),Nt=a(v,"LI",{});var Or=r(Nt);Je=a(Or,"A",{href:!0});var Wr=r(Je);Mo=f(Wr,"Textual Inversion"),Wr.forEach(t),Or.forEach(t),v.forEach(t),ni=u(e),k=a(e,"H2",{class:!0});var Ji=r(k);H=a(Ji,"A",{id:!0,class:!0,href:!0});var Jr=r(H);jt=a(Jr,"SPAN",{});var Rr=r(jt);E(ne.$$.fragment,Rr),Rr.forEach(t),Jr.forEach(t),Uo=u(Ji),Dt=a(Ji,"SPAN",{});var Kr=r(Dt);Zo=f(Kr,"Instruct pix2pix"),Kr.forEach(t),Ji.forEach(t),si=u(e),Re=a(e,"P",{});var Qr=r(Re);se=a(Qr,"A",{href:!0,rel:!0});var Vr=r(se);Oo=f(Vr,"Paper"),Vr.forEach(t),Qr.forEach(t),li=u(e),le=a(e,"P",{});var ur=r(le);Ke=a(ur,"A",{href:!0});var Xr=r(Ke);Wo=f(Xr,"Pix2Pix"),Xr.forEach(t),Jo=f(ur,` is fine-tuned from stable diffusion to support editing input images. It takes as input an image with a prompt describing an edit, and it outputs the edited image.
	Pix2Pix has been trained to work explicitely well with instructGPT-like prompts.`),ur.forEach(t),fi=u(e),F=a(e,"P",{});var Ri=r(F);Ro=f(Ri,"See "),Qe=a(Ri,"A",{href:!0});var Yr=r(Qe);Ko=f(Yr,"here"),Yr.forEach(t),Qo=f(Ri," for more information on how to use it."),Ri.forEach(t),pi=u(e),I=a(e,"H2",{class:!0});var Ki=r(I);B=a(Ki,"A",{id:!0,class:!0,href:!0});var en=r(B);zt=a(en,"SPAN",{});var tn=r(zt);E(fe.$$.fragment,tn),tn.forEach(t),en.forEach(t),Vo=u(Ki),Gt=a(Ki,"SPAN",{});var on=r(Gt);Xo=f(on,"Pix2PixZero"),on.forEach(t),Ki.forEach(t),hi=u(e),Ve=a(e,"P",{});var an=r(Ve);pe=a(an,"A",{href:!0,rel:!0});var rn=r(pe);Yo=f(rn,"Paper"),rn.forEach(t),an.forEach(t),ui=u(e),he=a(e,"P",{});var cr=r(he);Xe=a(cr,"A",{href:!0});var nn=r(Xe);ea=f(nn,"Pix2Pix-zero"),nn.forEach(t),ta=f(cr," allows modifying an image from one concept to another while preserving general image semantics."),cr.forEach(t),ci=u(e),Ye=a(e,"P",{});var sn=r(Ye);ia=f(sn,"The denoising process is guided from one conceptual embedding towards another conceptual embedding. The intermediate latents are optimized during the denoising process to push the attention maps towards reference attention maps. The reference attention maps are from the denoising process of the input image and are used to encourage semantic preservation."),sn.forEach(t),di=u(e),et=a(e,"P",{});var ln=r(et);oa=f(ln,"Pix2PixZero can be used both to edit synthetic images as well as real images."),ln.forEach(t),mi=u(e),C=a(e,"UL",{});var Qi=r(C);ue=a(Qi,"LI",{});var Vi=r(ue);aa=f(Vi,`To edit synthetic images, one first generates on image given a caption.
	Next, for a concept of the caption that shall be edited as well as the new target concept one generates image captions (e.g. with a model like `),ce=a(Vi,"A",{href:!0,rel:!0});var fn=r(ce);ra=f(fn,"Flan-T5"),fn.forEach(t),na=f(Vi,"). Then, \u201Cmean\u201D prompt embeddings for both the source and target concepts are created via the text encoder. Finally, the pix2pix-zero algorithm is used to edit the synthetic image."),Vi.forEach(t),sa=u(Qi),de=a(Qi,"LI",{});var Xi=r(de);la=f(Xi,"To edit a real image, one first generates an image caption using a model like "),me=a(Xi,"A",{href:!0,rel:!0});var pn=r(me);fa=f(pn,"Blip"),pn.forEach(t),pa=f(Xi,". Then one applies ddim inversion on the prompt and image to generate \u201Cinverse\u201D latents. Similar to before, \u201Cmean\u201D prompt embeddings for both source and target concepts are created and finally the pix2pix-zero algorithm in combination with the \u201Cinverse\u201D latents is used to edit the image."),Xi.forEach(t),Qi.forEach(t),vi=u(e),E(M.$$.fragment,e),gi=u(e),U=a(e,"P",{});var Yi=r(U);ha=f(Yi,"See "),tt=a(Yi,"A",{href:!0});var hn=r(tt);ua=f(hn,"here"),hn.forEach(t),ca=f(Yi," for more information on how to use it."),Yi.forEach(t),wi=u(e),T=a(e,"H2",{class:!0});var eo=r(T);Z=a(eo,"A",{id:!0,class:!0,href:!0});var un=r(Z);Lt=a(un,"SPAN",{});var cn=r(Lt);E(ve.$$.fragment,cn),cn.forEach(t),un.forEach(t),da=u(eo),Ht=a(eo,"SPAN",{});var dn=r(Ht);ma=f(dn,"Attend and excite"),dn.forEach(t),eo.forEach(t),_i=u(e),it=a(e,"P",{});var mn=r(it);ge=a(mn,"A",{href:!0,rel:!0});var vn=r(ge);va=f(vn,"Paper"),vn.forEach(t),mn.forEach(t),bi=u(e),we=a(e,"P",{});var dr=r(we);ot=a(dr,"A",{href:!0});var gn=r(ot);ga=f(gn,"Attend and excite"),gn.forEach(t),wa=f(dr," allows subjects in the prompt to be faithfully represented in the final image."),dr.forEach(t),xi=u(e),at=a(e,"P",{});var wn=r(at);_a=f(wn,"A set of token indices are given as input, corresponding to the subjects in the prompt that need to be present in the image. During denoising, each token index is insured to have above a minimum attention threshold for at least one patch of the image. The intermediate latents are iteratively optimized during the denoising process to strengthen the attention of the most neglected subject token until the attention threshold is passed for all subject tokens."),wn.forEach(t),Ei=u(e),O=a(e,"P",{});var to=r(O);ba=f(to,"See "),rt=a(to,"A",{href:!0});var _n=r(rt);xa=f(_n,"here"),_n.forEach(t),Ea=f(to," for more information on how to use it."),to.forEach(t),yi=u(e),q=a(e,"H2",{class:!0});var io=r(q);W=a(io,"A",{id:!0,class:!0,href:!0});var bn=r(W);Ft=a(bn,"SPAN",{});var xn=r(Ft);E(_e.$$.fragment,xn),xn.forEach(t),bn.forEach(t),ya=u(io),Bt=a(io,"SPAN",{});var En=r(Bt);Pa=f(En,"Semantic guidance"),En.forEach(t),io.forEach(t),Pi=u(e),nt=a(e,"P",{});var yn=r(nt);be=a(yn,"A",{href:!0,rel:!0});var Pn=r(be);$a=f(Pn,"Paper"),Pn.forEach(t),yn.forEach(t),$i=u(e),st=a(e,"P",{});var $n=r(st);Aa=f($n,"SEGA allows applying or removing one or more concepts from an image. The strength of the concept can also be controlled. I.e. the smile concept can be used to incrementally increase or decrease the smile of a portrait."),$n.forEach(t),Ai=u(e),lt=a(e,"P",{});var An=r(lt);Sa=f(An,"Similar to how classifier free guidance provides guidance via empty prompt inputs, SEGA provides guidance on conceptual prompts. Multiple of these conceptual prompts can be applied simultaneously. Each conceptual prompt can either add or remove their concept depending on if the guidance is applied positively or negatively."),An.forEach(t),Si=u(e),J=a(e,"P",{});var oo=r(J);ka=f(oo,"See "),ft=a(oo,"A",{href:!0});var Sn=r(ft);Ia=f(Sn,"here"),Sn.forEach(t),Ta=f(oo," for more information on how to use it."),oo.forEach(t),ki=u(e),N=a(e,"H2",{class:!0});var ao=r(N);R=a(ao,"A",{id:!0,class:!0,href:!0});var kn=r(R);Ct=a(kn,"SPAN",{});var In=r(Ct);E(xe.$$.fragment,In),In.forEach(t),kn.forEach(t),qa=u(ao),Mt=a(ao,"SPAN",{});var Tn=r(Mt);Na=f(Tn,"Self attention guidance"),Tn.forEach(t),ao.forEach(t),Ii=u(e),pt=a(e,"P",{});var qn=r(pt);Ee=a(qn,"A",{href:!0,rel:!0});var Nn=r(Ee);ja=f(Nn,"Paper"),Nn.forEach(t),qn.forEach(t),Ti=u(e),ye=a(e,"P",{});var mr=r(ye);ht=a(mr,"A",{href:!0});var jn=r(ht);Da=f(jn,"Self attention guidance"),jn.forEach(t),za=f(mr," improves the general quality of images."),mr.forEach(t),qi=u(e),ut=a(e,"P",{});var Dn=r(ut);Ga=f(Dn,"SAG provides guidance from predictions not conditioned on high frequency details to fully conditioned images. The high frequency details are extracted out of the UNet self-attention maps."),Dn.forEach(t),Ni=u(e),K=a(e,"P",{});var ro=r(K);La=f(ro,"See "),ct=a(ro,"A",{href:!0});var zn=r(ct);Ha=f(zn,"here"),zn.forEach(t),Fa=f(ro," for more information on how to use it."),ro.forEach(t),ji=u(e),j=a(e,"H2",{class:!0});var no=r(j);Q=a(no,"A",{id:!0,class:!0,href:!0});var Gn=r(Q);Ut=a(Gn,"SPAN",{});var Ln=r(Ut);E(Pe.$$.fragment,Ln),Ln.forEach(t),Gn.forEach(t),Ba=u(no),Zt=a(no,"SPAN",{});var Hn=r(Zt);Ca=f(Hn,"Depth2image"),Hn.forEach(t),no.forEach(t),Di=u(e),dt=a(e,"P",{});var Fn=r(dt);$e=a(Fn,"A",{href:!0,rel:!0});var Bn=r($e);Ma=f(Bn,"Paper"),Bn.forEach(t),Fn.forEach(t),zi=u(e),Ae=a(e,"P",{});var vr=r(Ae);mt=a(vr,"A",{href:!0});var Cn=r(mt);Ua=f(Cn,"Depth2image"),Cn.forEach(t),Za=f(vr," is fine-tuned from stable diffusion to better preserve semantics for text guided image variation."),vr.forEach(t),Gi=u(e),vt=a(e,"P",{});var Mn=r(vt);Oa=f(Mn,"It conditions on a monocular depth estimate of the original image."),Mn.forEach(t),Li=u(e),V=a(e,"P",{});var so=r(V);Wa=f(so,"See "),gt=a(so,"A",{href:!0});var Un=r(gt);Ja=f(Un,"here"),Un.forEach(t),Ra=f(so," for more information on how to use it."),so.forEach(t),Hi=u(e),D=a(e,"H3",{class:!0});var lo=r(D);X=a(lo,"A",{id:!0,class:!0,href:!0});var Zn=r(X);Ot=a(Zn,"SPAN",{});var On=r(Ot);E(Se.$$.fragment,On),On.forEach(t),Zn.forEach(t),Ka=u(lo),Wt=a(lo,"SPAN",{});var Wn=r(Wt);Qa=f(Wn,"Fine-tuning methods"),Wn.forEach(t),lo.forEach(t),Fi=u(e),wt=a(e,"P",{});var Jn=r(wt);Va=f(Jn,"In addition to pre-trained models, diffusers has training scripts for fine-tuning models on user provided data."),Jn.forEach(t),Bi=u(e),z=a(e,"H2",{class:!0});var fo=r(z);Y=a(fo,"A",{id:!0,class:!0,href:!0});var Rn=r(Y);Jt=a(Rn,"SPAN",{});var Kn=r(Jt);E(ke.$$.fragment,Kn),Kn.forEach(t),Rn.forEach(t),Xa=u(fo),Rt=a(fo,"SPAN",{});var Qn=r(Rt);Ya=f(Qn,"DreamBooth"),Qn.forEach(t),fo.forEach(t),Ci=u(e),Ie=a(e,"P",{});var gr=r(Ie);_t=a(gr,"A",{href:!0});var Vn=r(_t);er=f(Vn,"DreamBooth"),Vn.forEach(t),tr=f(gr," fine-tunes a model to teach it about a new subject. I.e. a few pictures of a person can be used to generate images of that person in different styles."),gr.forEach(t),Mi=u(e),ee=a(e,"P",{});var po=r(ee);ir=f(po,"See "),bt=a(po,"A",{href:!0});var Xn=r(bt);or=f(Xn,"here"),Xn.forEach(t),ar=f(po," for more information on how to use it."),po.forEach(t),Ui=u(e),G=a(e,"H2",{class:!0});var ho=r(G);te=a(ho,"A",{id:!0,class:!0,href:!0});var Yn=r(te);Kt=a(Yn,"SPAN",{});var es=r(Kt);E(Te.$$.fragment,es),es.forEach(t),Yn.forEach(t),rr=u(ho),Qt=a(ho,"SPAN",{});var ts=r(Qt);nr=f(ts,"Textual Inversion"),ts.forEach(t),ho.forEach(t),Zi=u(e),qe=a(e,"P",{});var wr=r(qe);xt=a(wr,"A",{href:!0});var is=r(xt);sr=f(is,"Textual Inversion"),is.forEach(t),lr=f(wr," fine-tunes a model to teach it about a new concept. I.e. a few pictures of a style of artwork can be used to generate images in that style."),wr.forEach(t),Oi=u(e),ie=a(e,"P",{});var uo=r(ie);fr=f(uo,"See "),Et=a(uo,"A",{href:!0});var os=r(Et);pr=f(os,"here"),os.forEach(t),hr=f(uo," for more information on how to use it."),uo.forEach(t),this.h()},h(){p(d,"name","hf:doc:metadata"),p(d,"content",JSON.stringify(us)),p(w,"id","controlling-generation-of-diffusion-models"),p(w,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(w,"href","#controlling-generation-of-diffusion-models"),p(m,"class","relative group"),p(ae,"href","https://discuss.huggingface.co/"),p(ae,"rel","nofollow"),p(re,"href","https://github.com/huggingface/diffusers/issues"),p(re,"rel","nofollow"),p(Be,"href","#instruct-pix2pix"),p(Ce,"href","#pix2pixzero"),p(Me,"href","#attend-and-excite"),p(Ue,"href","#semantic-guidance"),p(Ze,"href","#self-attention-guidance"),p(Oe,"href","#depth2image"),p(We,"href","#dreambooth"),p(Je,"href","#textual-inversion"),p(H,"id","instruct-pix2pix"),p(H,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(H,"href","#instruct-pix2pix"),p(k,"class","relative group"),p(se,"href","https://github.com/timothybrooks/instruct-pix2pix"),p(se,"rel","nofollow"),p(Ke,"href","../api/pipelines/stable_diffusion/pix2pix"),p(Qe,"href","../api/pipelines/stable_diffusion/pix2pix"),p(B,"id","pix2pixzero"),p(B,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(B,"href","#pix2pixzero"),p(I,"class","relative group"),p(pe,"href","https://pix2pixzero.github.io/"),p(pe,"rel","nofollow"),p(Xe,"href","../api/pipelines/stable_diffusion/pix2pix_zero"),p(ce,"href","https://huggingface.co/docs/transformers/model_doc/flan-t5"),p(ce,"rel","nofollow"),p(me,"href","https://huggingface.co/docs/transformers/model_doc/blip"),p(me,"rel","nofollow"),p(tt,"href","../api/pipelines/stable_diffusion/pix2pix_zero"),p(Z,"id","attend-and-excite"),p(Z,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(Z,"href","#attend-and-excite"),p(T,"class","relative group"),p(ge,"href","https://attendandexcite.github.io/Attend-and-Excite/"),p(ge,"rel","nofollow"),p(ot,"href","../api/pipelines/stable_diffusion/attend_and_excite"),p(rt,"href","../api/pipelines/stable_diffusion/attend_and_excite"),p(W,"id","semantic-guidance"),p(W,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(W,"href","#semantic-guidance"),p(q,"class","relative group"),p(be,"href","https://arxiv.org/abs/2301.12247"),p(be,"rel","nofollow"),p(ft,"href","../api/pipelines/semantic_stable_diffusion"),p(R,"id","self-attention-guidance"),p(R,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(R,"href","#self-attention-guidance"),p(N,"class","relative group"),p(Ee,"href","https://arxiv.org/abs/2210.00939"),p(Ee,"rel","nofollow"),p(ht,"href","../api/pipelines/stable_diffusion/self_attention_guidance"),p(ct,"href","../api/pipelines/stable_diffusion/self_attention_guidance"),p(Q,"id","depth2image"),p(Q,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(Q,"href","#depth2image"),p(j,"class","relative group"),p($e,"href","https://huggingface.co/stabilityai/stable-diffusion-2-depth"),p($e,"rel","nofollow"),p(mt,"href","../pipelines/stable_diffusion_2#depthtoimage"),p(gt,"href","../api/pipelines/stable_diffusion_2#depthtoimage"),p(X,"id","finetuning-methods"),p(X,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(X,"href","#finetuning-methods"),p(D,"class","relative group"),p(Y,"id","dreambooth"),p(Y,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(Y,"href","#dreambooth"),p(z,"class","relative group"),p(_t,"href","../training/dreambooth"),p(bt,"href","../training/dreambooth"),p(te,"id","textual-inversion"),p(te,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),p(te,"href","#textual-inversion"),p(G,"class","relative group"),p(xt,"href","../training/text_inversion"),p(Et,"href","../training/text_inversion")},m(e,n){i(document.head,d),s(e,L,n),s(e,m,n),i(m,w),i(w,b),y(g,b,null),i(m,je),i(m,yt),i(yt,co),s(e,Xt,n),s(e,De,n),i(De,mo),s(e,Yt,n),s(e,ze,n),i(ze,vo),s(e,ei,n),s(e,Ge,n),i(Ge,go),s(e,ti,n),s(e,_,n),i(_,wo),i(_,Pt),i(Pt,_o),i(_,bo),i(_,ae),i(ae,xo),i(_,Eo),i(_,re),i(re,yo),i(_,Po),s(e,ii,n),s(e,Le,n),i(Le,$o),s(e,oi,n),s(e,He,n),i(He,Ao),s(e,ai,n),s(e,Fe,n),i(Fe,So),s(e,ri,n),s(e,c,n),i(c,$t),i($t,Be),i(Be,ko),i(c,Io),i(c,At),i(At,Ce),i(Ce,To),i(c,qo),i(c,St),i(St,Me),i(Me,No),i(c,jo),i(c,kt),i(kt,Ue),i(Ue,Do),i(c,zo),i(c,It),i(It,Ze),i(Ze,Go),i(c,Lo),i(c,Tt),i(Tt,Oe),i(Oe,Ho),i(c,Fo),i(c,qt),i(qt,We),i(We,Bo),i(c,Co),i(c,Nt),i(Nt,Je),i(Je,Mo),s(e,ni,n),s(e,k,n),i(k,H),i(H,jt),y(ne,jt,null),i(k,Uo),i(k,Dt),i(Dt,Zo),s(e,si,n),s(e,Re,n),i(Re,se),i(se,Oo),s(e,li,n),s(e,le,n),i(le,Ke),i(Ke,Wo),i(le,Jo),s(e,fi,n),s(e,F,n),i(F,Ro),i(F,Qe),i(Qe,Ko),i(F,Qo),s(e,pi,n),s(e,I,n),i(I,B),i(B,zt),y(fe,zt,null),i(I,Vo),i(I,Gt),i(Gt,Xo),s(e,hi,n),s(e,Ve,n),i(Ve,pe),i(pe,Yo),s(e,ui,n),s(e,he,n),i(he,Xe),i(Xe,ea),i(he,ta),s(e,ci,n),s(e,Ye,n),i(Ye,ia),s(e,di,n),s(e,et,n),i(et,oa),s(e,mi,n),s(e,C,n),i(C,ue),i(ue,aa),i(ue,ce),i(ce,ra),i(ue,na),i(C,sa),i(C,de),i(de,la),i(de,me),i(me,fa),i(de,pa),s(e,vi,n),y(M,e,n),s(e,gi,n),s(e,U,n),i(U,ha),i(U,tt),i(tt,ua),i(U,ca),s(e,wi,n),s(e,T,n),i(T,Z),i(Z,Lt),y(ve,Lt,null),i(T,da),i(T,Ht),i(Ht,ma),s(e,_i,n),s(e,it,n),i(it,ge),i(ge,va),s(e,bi,n),s(e,we,n),i(we,ot),i(ot,ga),i(we,wa),s(e,xi,n),s(e,at,n),i(at,_a),s(e,Ei,n),s(e,O,n),i(O,ba),i(O,rt),i(rt,xa),i(O,Ea),s(e,yi,n),s(e,q,n),i(q,W),i(W,Ft),y(_e,Ft,null),i(q,ya),i(q,Bt),i(Bt,Pa),s(e,Pi,n),s(e,nt,n),i(nt,be),i(be,$a),s(e,$i,n),s(e,st,n),i(st,Aa),s(e,Ai,n),s(e,lt,n),i(lt,Sa),s(e,Si,n),s(e,J,n),i(J,ka),i(J,ft),i(ft,Ia),i(J,Ta),s(e,ki,n),s(e,N,n),i(N,R),i(R,Ct),y(xe,Ct,null),i(N,qa),i(N,Mt),i(Mt,Na),s(e,Ii,n),s(e,pt,n),i(pt,Ee),i(Ee,ja),s(e,Ti,n),s(e,ye,n),i(ye,ht),i(ht,Da),i(ye,za),s(e,qi,n),s(e,ut,n),i(ut,Ga),s(e,Ni,n),s(e,K,n),i(K,La),i(K,ct),i(ct,Ha),i(K,Fa),s(e,ji,n),s(e,j,n),i(j,Q),i(Q,Ut),y(Pe,Ut,null),i(j,Ba),i(j,Zt),i(Zt,Ca),s(e,Di,n),s(e,dt,n),i(dt,$e),i($e,Ma),s(e,zi,n),s(e,Ae,n),i(Ae,mt),i(mt,Ua),i(Ae,Za),s(e,Gi,n),s(e,vt,n),i(vt,Oa),s(e,Li,n),s(e,V,n),i(V,Wa),i(V,gt),i(gt,Ja),i(V,Ra),s(e,Hi,n),s(e,D,n),i(D,X),i(X,Ot),y(Se,Ot,null),i(D,Ka),i(D,Wt),i(Wt,Qa),s(e,Fi,n),s(e,wt,n),i(wt,Va),s(e,Bi,n),s(e,z,n),i(z,Y),i(Y,Jt),y(ke,Jt,null),i(z,Xa),i(z,Rt),i(Rt,Ya),s(e,Ci,n),s(e,Ie,n),i(Ie,_t),i(_t,er),i(Ie,tr),s(e,Mi,n),s(e,ee,n),i(ee,ir),i(ee,bt),i(bt,or),i(ee,ar),s(e,Ui,n),s(e,G,n),i(G,te),i(te,Kt),y(Te,Kt,null),i(G,rr),i(G,Qt),i(Qt,nr),s(e,Zi,n),s(e,qe,n),i(qe,xt),i(xt,sr),i(qe,lr),s(e,Oi,n),s(e,ie,n),i(ie,fr),i(ie,Et),i(Et,pr),i(ie,hr),Wi=!0},p(e,[n]){const Ne={};n&2&&(Ne.$$scope={dirty:n,ctx:e}),M.$set(Ne)},i(e){Wi\|\|(P(g.$$.fragment,e),P(ne.$$.fragment,e),P(fe.$$.fragment,e),P(M.$$.fragment,e),P(ve.$$.fragment,e),P(_e.$$.fragment,e),P(xe.$$.fragment,e),P(Pe.$$.fragment,e),P(Se.$$.fragment,e),P(ke.$$.fragment,e),P(Te.$$.fragment,e),Wi=!0)},o(e){$(g.$$.fragment,e),$(ne.$$.fragment,e),$(fe.$$.fragment,e),$(M.$$.fragment,e),$(ve.$$.fragment,e),$(_e.$$.fragment,e),$(xe.$$.fragment,e),$(Pe.$$.fragment,e),$(Se.$$.fragment,e),$(ke.$$.fragment,e),$(Te.$$.fragment,e),Wi=!1},d(e){t(d),e&&t(L),e&&t(m),A(g),e&&t(Xt),e&&t(De),e&&t(Yt),e&&t(ze),e&&t(ei),e&&t(Ge),e&&t(ti),e&&t(_),e&&t(ii),e&&t(Le),e&&t(oi),e&&t(He),e&&t(ai),e&&t(Fe),e&&t(ri),e&&t(c),e&&t(ni),e&&t(k),A(ne),e&&t(si),e&&t(Re),e&&t(li),e&&t(le),e&&t(fi),e&&t(F),e&&t(pi),e&&t(I),A(fe),e&&t(hi),e&&t(Ve),e&&t(ui),e&&t(he),e&&t(ci),e&&t(Ye),e&&t(di),e&&t(et),e&&t(mi),e&&t(C),e&&t(vi),A(M,e),e&&t(gi),e&&t(U),e&&t(wi),e&&t(T),A(ve),e&&t(_i),e&&t(it),e&&t(bi),e&&t(we),e&&t(xi),e&&t(at),e&&t(Ei),e&&t(O),e&&t(yi),e&&t(q),A(_e),e&&t(Pi),e&&t(nt),e&&t($i),e&&t(st),e&&t(Ai),e&&t(lt),e&&t(Si),e&&t(J),e&&t(ki),e&&t(N),A(xe),e&&t(Ii),e&&t(pt),e&&t(Ti),e&&t(ye),e&&t(qi),e&&t(ut),e&&t(Ni),e&&t(K),e&&t(ji),e&&t(j),A(Pe),e&&t(Di),e&&t(dt),e&&t(zi),e&&t(Ae),e&&t(Gi),e&&t(vt),e&&t(Li),e&&t(V),e&&t(Hi),e&&t(D),A(Se),e&&t(Fi),e&&t(wt),e&&t(Bi),e&&t(z),A(ke),e&&t(Ci),e&&t(Ie),e&&t(Mi),e&&t(ee),e&&t(Ui),e&&t(G),A(Te),e&&t(Zi),e&&t(qe),e&&t(Oi),e&&t(ie)}}}const us={local:"controlling-generation-of-diffusion-models",sections:[{local:"instruct-pix2pix",title:"Instruct pix2pix"},{local:"pix2pixzero",title:"Pix2PixZero"},{local:"attend-and-excite",title:"Attend and excite"},{local:"semantic-guidance",title:"Semantic guidance"},{local:"self-attention-guidance",title:"Self attention guidance"},{local:"depth2image",sections:[{local:"finetuning-methods",title:"Fine-tuning methods"}],title:"Depth2image"},{local:"dreambooth",title:"DreamBooth"},{local:"textual-inversion",title:"Textual Inversion"}],title:"Controlling generation of diffusion models"};function cs(Vt){return ls(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class gs extends as{constructor(d){super();rs(this,d,cs,hs,ns,{})}}export{gs as default,us as metadata};

Xet Storage Details

Size:: 34.4 kB
Xet hash:: e6dbec856c6649d249eda0fc63c67be684f8904c9fbd896f0102087171791047

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.