# Stable Video Diffusion

Stable Video Diffusion was proposed in [Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets](https://hf.co/papers/2311.15127) by Andreas Blattmann, Tim Dockhorn, Sumith Kulal, Daniel Mendelevitch, Maciej Kilian, Dominik Lorenz, Yam Levi, Zion English, Vikram Voleti, Adam Letts, Varun Jampani, Robin Rombach.

The abstract from the paper is:

*We present Stable Video Diffusion - a latent video diffusion model for high-resolution, state-of-the-art text-to-video and image-to-video generation. Recently, latent diffusion models trained for 2D image synthesis have been turned into generative video models by inserting temporal layers and finetuning them on small, high-quality video datasets. However, training methods in the literature vary widely, and the field has yet to agree on a unified strategy for curating video data. In this paper, we identify and evaluate three different stages for successful training of video LDMs: text-to-image pretraining, video pretraining, and high-quality video finetuning. Furthermore, we demonstrate the necessity of a well-curated pretraining dataset for generating high-quality videos and present a systematic curation process to train a strong base model, including captioning and filtering strategies. We then explore the impact of finetuning our base model on high-quality data and train a text-to-video model that is competitive with closed-source video generation. We also show that our base model provides a powerful motion representation for downstream tasks such as image-to-video generation and adaptability to camera motion-specific LoRA modules. Finally, we demonstrate that our model provides a strong multi-view 3D-prior and can serve as a base to finetune a multi-view diffusion model that jointly generates multiple views of objects in a feedforward fashion, outperforming image-based methods at a fraction of their compute budget. We release code and model weights at this https URL.*

> To learn how to use Stable Video Diffusion, take a look at the [Stable Video Diffusion](../../../using-diffusers/svd) guide.
>
> Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the [base](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [extended frame](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) checkpoints!

## Tips

Video generation is memory-intensive, and one way to reduce your memory usage is to set `enable_forward_chunking` on the pipeline's UNet so you don't run the entire feedforward layer at once. Breaking it up into chunks in a loop is more efficient.

Check out the [Text or image-to-video](../../../using-diffusers/text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
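A minimal sketch of those memory-saving options; the checkpoint name, input image path, and chunking settings below are illustrative, so see the guides linked above for full, tested examples:

```python
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",  # extended-frame checkpoint (illustrative)
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe.enable_model_cpu_offload()      # keep idle sub-models on the CPU
pipe.unet.enable_forward_chunking()  # run the UNet feedforward layers in chunks

image = load_image("path/to/conditioning_image.png").resize((1024, 576))  # placeholder input

# A small decode_chunk_size also lowers peak memory when the VAE decodes the frames.
frames = pipe(image, decode_chunk_size=2, generator=torch.manual_seed(42)).frames[0]
export_to_video(frames, "generated.mp4", fps=7)
```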
## StableVideoDiffusionPipeline

**class diffusers.StableVideoDiffusionPipeline**(vae: AutoencoderKLTemporalDecoder, image_encoder: CLIPVisionModelWithProjection, unet: UNetSpatioTemporalConditionModel, scheduler: EulerDiscreteScheduler, feature_extractor: CLIPImageProcessor)

Parameters:

- **vae** (`AutoencoderKLTemporalDecoder`) — Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
- **image_encoder** ([CLIPVisionModelWithProjection](https://huggingface.co/docs/transformers/main/en/model_doc/clip#transformers.CLIPVisionModelWithProjection)) — Frozen CLIP image encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
- **unet** (`UNetSpatioTemporalConditionModel`) — A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
- **scheduler** ([EulerDiscreteScheduler](/docs/diffusers/pr_12509/en/api/schedulers/euler#diffusers.EulerDiscreteScheduler)) — A scheduler to be used in combination with `unet` to denoise the encoded image latents.
- **feature_extractor** ([CLIPImageProcessor](https://huggingface.co/docs/transformers/main/en/model_doc/clip#transformers.CLIPImageProcessor)) — A `CLIPImageProcessor` to extract features from generated images.

Pipeline to generate video from an input image using Stable Video Diffusion.

This model inherits from [DiffusionPipeline](/docs/diffusers/pr_12509/en/api/pipelines/overview#diffusers.DiffusionPipeline). Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.).
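For reference, a quick sketch (the checkpoint name below is an assumption; any Stable Video Diffusion checkpoint works) showing that the components loaded by `from_pretrained()` correspond to the constructor signature documented above:

```python
import torch
from diffusers import StableVideoDiffusionPipeline

pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid",  # illustrative checkpoint
    torch_dtype=torch.float16,
    variant="fp16",
)

# Each component matches one of the constructor arguments documented above.
print(type(pipe.vae).__name__)                # AutoencoderKLTemporalDecoder
print(type(pipe.image_encoder).__name__)      # CLIPVisionModelWithProjection
print(type(pipe.unet).__name__)               # UNetSpatioTemporalConditionModel
print(type(pipe.scheduler).__name__)          # EulerDiscreteScheduler
print(type(pipe.feature_extractor).__name__)  # CLIPImageProcessor
```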
## StableVideoDiffusionPipelineOutput

**class diffusers.pipelines.stable_video_diffusion.StableVideoDiffusionPipelineOutput**(frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor])

Parameters:

- **frames** (`List[List[PIL.Image.Image]]`, `np.ndarray`, or `torch.Tensor`) — List of denoised PIL images of length `batch_size`, or a NumPy array or torch tensor of shape `(batch_size, num_frames, height, width, num_channels)`.

Output class for Stable Video Diffusion pipeline.