Buckets:
// Compiled Svelte page component for the `WanAnimateTransformer3DModel` API docs page.
//
// This module is build output. It is reproduced here with extraction residue removed:
// stray leading "| " / trailing " | |" table markers (including inside template literals)
// and a raw newline that had split one double-quoted class string.
//
// NOTE(review): the single-letter helpers below are minified aliases from ../chunks/index
// and ../chunks/scheduler. Their roles (create / claim / mount / transition / destroy) are
// inferred from the usual shape of a compiled Svelte fragment — confirm against the chunks.
//
// NOTE(review): several rendered descriptions disagree with the signature shown in the same
// Docstring props (in_channels: 16 vs 36, text_dim: 512 vs 4096, added_kv_proj_dim: 5120 vs
// None, qk_norm documented as bool but typed `str | None`); `num_attention_heads` carries a
// text-embedding description; `window_size` is described but absent from the signature; and
// `pose_hidden_states` still says "TODO: description". These strings are generated from the
// upstream Python docstring and should be corrected there, so they are left unchanged here.
import { s as fe, n as ue, o as _e } from "../chunks/scheduler.53228c21.js";
import {
  S as pe,
  i as he,
  e as i,
  s as a,
  c,
  h as ge,
  a as d,
  d as t,
  b as r,
  f as F,
  g as f,
  j as H,
  k as I,
  l as u,
  m as o,
  n as _,
  t as p,
  o as h,
  p as g,
} from "../chunks/index.100fac89.js";
import { C as Te } from "../chunks/CopyLLMTxtMenu.af3e1493.js";
import { D as ae } from "../chunks/Docstring.147b33f1.js";
import { C as ve } from "../chunks/CodeBlock.0adb3827.js";
import { H as re, E as be } from "../chunks/MermaidChart.svelte_svelte_type_style_lang.b5eefd91.js";

// JSON string written to the <meta name="hf:doc:metadata"> tag: page title and section outline.
const Me = '{"title":"WanAnimateTransformer3DModel","local":"wananimatetransformer3dmodel","sections":[{"title":"WanAnimateTransformer3DModel","local":"diffusers.WanAnimateTransformer3DModel","sections":[],"depth":2},{"title":"Transformer2DModelOutput","local":"diffusers.models.modeling_outputs.Transformer2DModelOutput","sections":[],"depth":2}],"depth":1}';

/**
 * Fragment factory for the page.
 *
 * Instantiates the child components (copy menu, headings, code block, docstrings, edit link)
 * and returns the fragment object whose methods (`c`, `l`, `h`, `m`, `p`, `i`, `o`, `d`) build,
 * hydrate, attribute, insert, update, transition and tear down the page's DOM.
 *
 * @param {*} se Unused by this fragment (the page has no reactive state; `p` is a no-op).
 * @returns {object} The fragment object consumed by the component initialiser `he`.
 */
function $e(se) {
  // DOM node handles; assigned in c() (fresh render) or l() (hydration).
  let m, L, j, U, P, Z, $, B, M, V, O, S, s, ee, N, ne, T, te, z, G, R, l, oe, C, J, X, q;
  // True once intro transitions have been started; reset when outro starts.
  let Y;

  // Static text / HTML for the <p> elements on the page.
  const ie = 'A Diffusion Transformer model for 3D video-like data was introduced in <a href="https://github.com/Wan-Video/Wan2.2" rel="nofollow">Wan Animate</a> by the Alibaba Wan Team.';
  const de = "The model can be loaded with the following code snippet.";
  const me = "A Transformer model for video-like data used in the WanAnimate model.";
  const le = "Forward pass of Wan2.2-Animate transformer model.";
  const ce = 'The output of <a href="/docs/diffusers/pr_13751/en/api/models/transformer2d#diffusers.Transformer2DModel">Transformer2DModel</a>.';

  // Child components, created in document order.
  const v = new Te({
    props: {
      containerStyle: "float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;",
    },
  });

  const b = new re({
    props: {
      title: "WanAnimateTransformer3DModel",
      local: "wananimatetransformer3dmodel",
      headingTag: "h1",
    },
  });

  const x = new ve({
    props: {
      // `code` is the base64 of the URI-encoded snippet; it encodes a blank line (%0A%0A)
      // between the two statements, which `highlighted` mirrors.
      code: "ZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMFdhbkFuaW1hdGVUcmFuc2Zvcm1lcjNETW9kZWwlMEElMEF0cmFuc2Zvcm1lciUyMCUzRCUyMFdhbkFuaW1hdGVUcmFuc2Zvcm1lcjNETW9kZWwuZnJvbV9wcmV0cmFpbmVkKCUyMldhbi1BSSUyRldhbjIuMi1BbmltYXRlLTE0Qi1EaWZmdXNlcnMlMjIlMkMlMjBzdWJmb2xkZXIlM0QlMjJ0cmFuc2Zvcm1lciUyMiUyQyUyMHRvcmNoX2R0eXBlJTNEdG9yY2guYmZsb2F0MTYp",
      highlighted: `<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> WanAnimateTransformer3DModel

transformer = WanAnimateTransformer3DModel.from_pretrained(<span class="hljs-string">"Wan-AI/Wan2.2-Animate-14B-Diffusers"</span>, subfolder=<span class="hljs-string">"transformer"</span>, torch_dtype=torch.bfloat16)`,
      lang: "python",
      wrap: false,
    },
  });

  const W = new re({
    props: {
      title: "WanAnimateTransformer3DModel",
      local: "diffusers.WanAnimateTransformer3DModel",
      headingTag: "h2",
    },
  });

  const D = new ae({
    props: {
      name: "class diffusers.WanAnimateTransformer3DModel",
      anchor: "diffusers.WanAnimateTransformer3DModel",
      parameters: [
        { name: "patch_size", val: ": tuple = (1, 2, 2)" },
        { name: "num_attention_heads", val: ": int = 40" },
        { name: "attention_head_dim", val: ": int = 128" },
        { name: "in_channels", val: ": int | None = 36" },
        { name: "latent_channels", val: ": int | None = 16" },
        { name: "out_channels", val: ": int | None = 16" },
        { name: "text_dim", val: ": int = 4096" },
        { name: "freq_dim", val: ": int = 256" },
        { name: "ffn_dim", val: ": int = 13824" },
        { name: "num_layers", val: ": int = 40" },
        { name: "cross_attn_norm", val: ": bool = True" },
        { name: "qk_norm", val: ": str | None = 'rms_norm_across_heads'" },
        { name: "eps", val: ": float = 1e-06" },
        { name: "image_dim", val: ": int | None = 1280" },
        { name: "added_kv_proj_dim", val: ": int | None = None" },
        { name: "rope_max_seq_len", val: ": int = 1024" },
        { name: "pos_embed_seq_len", val: ": int | None = None" },
        { name: "motion_encoder_channel_sizes", val: ": dict[str, int] | None = None" },
        { name: "motion_encoder_size", val: ": int = 512" },
        { name: "motion_style_dim", val: ": int = 512" },
        { name: "motion_dim", val: ": int = 20" },
        { name: "motion_encoder_dim", val: ": int = 512" },
        { name: "face_encoder_hidden_dim", val: ": int = 1024" },
        { name: "face_encoder_num_heads", val: ": int = 4" },
        { name: "inject_face_latents_blocks", val: ": int = 5" },
        { name: "motion_encoder_batch_size", val: ": int = 8" },
      ],
      parametersDescription: [
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.patch_size",
          description: `<strong>patch_size</strong> (<code>tuple[int]</code>, defaults to <code>(1, 2, 2)</code>) —
3D patch dimensions for video embedding (t_patch, h_patch, w_patch).`,
          name: "patch_size",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.num_attention_heads",
          description: `<strong>num_attention_heads</strong> (<code>int</code>, defaults to <code>40</code>) —
Fixed length for text embeddings.`,
          name: "num_attention_heads",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.attention_head_dim",
          description: `<strong>attention_head_dim</strong> (<code>int</code>, defaults to <code>128</code>) —
The number of channels in each head.`,
          name: "attention_head_dim",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.in_channels",
          description: `<strong>in_channels</strong> (<code>int</code>, defaults to <code>16</code>) —
The number of channels in the input.`,
          name: "in_channels",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.out_channels",
          description: `<strong>out_channels</strong> (<code>int</code>, defaults to <code>16</code>) —
The number of channels in the output.`,
          name: "out_channels",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.text_dim",
          description: `<strong>text_dim</strong> (<code>int</code>, defaults to <code>512</code>) —
Input dimension for text embeddings.`,
          name: "text_dim",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.freq_dim",
          description: `<strong>freq_dim</strong> (<code>int</code>, defaults to <code>256</code>) —
Dimension for sinusoidal time embeddings.`,
          name: "freq_dim",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.ffn_dim",
          description: `<strong>ffn_dim</strong> (<code>int</code>, defaults to <code>13824</code>) —
Intermediate dimension in feed-forward network.`,
          name: "ffn_dim",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.num_layers",
          description: `<strong>num_layers</strong> (<code>int</code>, defaults to <code>40</code>) —
The number of layers of transformer blocks to use.`,
          name: "num_layers",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.window_size",
          description: `<strong>window_size</strong> (<code>tuple[int]</code>, defaults to <code>(-1, -1)</code>) —
Window size for local attention (-1 indicates global attention).`,
          name: "window_size",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.cross_attn_norm",
          description: `<strong>cross_attn_norm</strong> (<code>bool</code>, defaults to <code>True</code>) —
Enable cross-attention normalization.`,
          name: "cross_attn_norm",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.qk_norm",
          description: `<strong>qk_norm</strong> (<code>bool</code>, defaults to <code>True</code>) —
Enable query/key normalization.`,
          name: "qk_norm",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.eps",
          description: `<strong>eps</strong> (<code>float</code>, defaults to <code>1e-6</code>) —
Epsilon value for normalization layers.`,
          name: "eps",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.image_dim",
          description: `<strong>image_dim</strong> (<code>int</code>, <em>optional</em>, defaults to <code>1280</code>) —
The number of channels to use for the image embedding. If <code>None</code>, no projection is used.`,
          name: "image_dim",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.added_kv_proj_dim",
          description: `<strong>added_kv_proj_dim</strong> (<code>int</code>, <em>optional</em>, defaults to <code>5120</code>) —
The number of channels to use for the added key and value projections. If <code>None</code>, no projection is used.`,
          name: "added_kv_proj_dim",
        },
      ],
      source: "https://github.com/huggingface/diffusers/blob/vr_13751/src/diffusers/models/transformers/transformer_wan_animate.py#L986",
    },
  });

  const w = new ae({
    props: {
      name: "forward",
      anchor: "diffusers.WanAnimateTransformer3DModel.forward",
      parameters: [
        { name: "hidden_states", val: ": Tensor" },
        { name: "timestep", val: ": LongTensor" },
        { name: "encoder_hidden_states", val: ": Tensor" },
        { name: "encoder_hidden_states_image", val: ": torch.Tensor | None = None" },
        { name: "pose_hidden_states", val: ": torch.Tensor | None = None" },
        { name: "face_pixel_values", val: ": torch.Tensor | None = None" },
        { name: "motion_encode_batch_size", val: ": int | None = None" },
        { name: "return_dict", val: ": bool = True" },
        { name: "attention_kwargs", val: ": dict[str, typing.Any] | None = None" },
      ],
      parametersDescription: [
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.forward.hidden_states",
          description: `<strong>hidden_states</strong> (<code>torch.Tensor</code> of shape <code>(B, 2C + 4, T + 1, H, W)</code>) —
Input noisy video latents of shape <code>(B, 2C + 4, T + 1, H, W)</code>, where B is the batch size, C is the
number of latent channels (16 for Wan VAE), T is the number of latent frames in an inference segment, H
is the latent height, and W is the latent width.`,
          name: "hidden_states",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.forward.timestep",
          description: `<strong>timestep</strong> — (<code>torch.LongTensor</code>):
The current timestep in the denoising loop.`,
          name: "timestep",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.forward.encoder_hidden_states",
          description: `<strong>encoder_hidden_states</strong> (<code>torch.Tensor</code>) —
Text embeddings from the text encoder (umT5 for Wan Animate).`,
          name: "encoder_hidden_states",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.forward.encoder_hidden_states_image",
          description: `<strong>encoder_hidden_states_image</strong> (<code>torch.Tensor</code>) —
CLIP visual features of the reference (character) image.`,
          name: "encoder_hidden_states_image",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.forward.pose_hidden_states",
          description: `<strong>pose_hidden_states</strong> (<code>torch.Tensor</code> of shape <code>(B, C, T, H, W)</code>) —
Pose video latents. TODO: description`,
          name: "pose_hidden_states",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.forward.face_pixel_values",
          description: `<strong>face_pixel_values</strong> (<code>torch.Tensor</code> of shape <code>(B, C', S, H', W')</code>) —
Face video in pixel space (not latent space). Typically C’ = 3 and H’ and W’ are the height/width of
the face video in pixels. Here S is the inference segment length, usually set to 77.`,
          name: "face_pixel_values",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.forward.motion_encode_batch_size",
          description: `<strong>motion_encode_batch_size</strong> (<code>int</code>, <em>optional</em>) —
The batch size for batched encoding of the face video via the motion encoder. Will default to
<code>self.config.motion_encoder_batch_size</code> if not set.`,
          name: "motion_encode_batch_size",
        },
        {
          anchor: "diffusers.WanAnimateTransformer3DModel.forward.return_dict",
          description: `<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) —
Whether to return the output as a dict or tuple.`,
          name: "return_dict",
        },
      ],
      source: "https://github.com/huggingface/diffusers/blob/vr_13751/src/diffusers/models/transformers/transformer_wan_animate.py#L1154",
    },
  });

  const y = new re({
    props: {
      title: "Transformer2DModelOutput",
      local: "diffusers.models.modeling_outputs.Transformer2DModelOutput",
      headingTag: "h2",
    },
  });

  const A = new ae({
    props: {
      name: "class diffusers.models.modeling_outputs.Transformer2DModelOutput",
      anchor: "diffusers.models.modeling_outputs.Transformer2DModelOutput",
      parameters: [{ name: "sample", val: ": torch.Tensor" }],
      parametersDescription: [
        {
          anchor: "diffusers.models.modeling_outputs.Transformer2DModelOutput.sample",
          description: `<strong>sample</strong> (<code>torch.Tensor</code> of shape <code>(batch_size, num_channels, height, width)</code> or <code>(batch size, num_vector_embeds - 1, num_latent_pixels)</code> if <a href="/docs/diffusers/pr_13751/en/api/models/transformer2d#diffusers.Transformer2DModel">Transformer2DModel</a> is discrete) —
The hidden states output conditioned on the <code>encoder_hidden_states</code> input. If discrete, returns probability
distributions for the unnoised latent pixels.`,
          name: "sample",
        },
      ],
      source: "https://github.com/huggingface/diffusers/blob/vr_13751/src/diffusers/models/modeling_outputs.py#L21",
    },
  });

  const k = new be({
    props: {
      source: "https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/models/wan_animate_transformer_3d.md",
    },
  });

  // Inside the methods below, bare `c`, `h`, `i`, `o`, `d`, `p` refer to the imported helpers
  // (method shorthand does not create a binding), while `m` and `l` refer to the node handles.
  return {
    // Build every node from scratch, then apply attributes via h().
    c() {
      m = i("meta");
      L = a();
      j = i("p");
      U = a();
      c(v.$$.fragment);
      P = a();
      c(b.$$.fragment);
      Z = a();
      $ = i("p");
      $.innerHTML = ie;
      B = a();
      M = i("p");
      M.textContent = de;
      V = a();
      c(x.$$.fragment);
      O = a();
      c(W.$$.fragment);
      S = a();
      s = i("div");
      c(D.$$.fragment);
      ee = a();
      N = i("p");
      N.textContent = me;
      ne = a();
      T = i("div");
      c(w.$$.fragment);
      te = a();
      z = i("p");
      z.textContent = le;
      G = a();
      c(y.$$.fragment);
      R = a();
      l = i("div");
      c(A.$$.fragment);
      oe = a();
      C = i("p");
      C.innerHTML = ce;
      J = a();
      c(k.$$.fragment);
      X = a();
      q = i("p");
      this.h();
    },

    // Adopt nodes from the list `e` instead of creating them; static content is only
    // rewritten when the node's recorded hash (via H) differs from the expected one.
    l(e) {
      const n = ge("svelte-u9bgzb", document.head);
      m = d(n, "META", { name: true, content: true });
      n.forEach(t);
      L = r(e);
      j = d(e, "P", {});
      F(j).forEach(t);
      U = r(e);
      f(v.$$.fragment, e);
      P = r(e);
      f(b.$$.fragment, e);
      Z = r(e);
      $ = d(e, "P", { "data-svelte-h": true });
      if (H($) !== "svelte-oai9e2") {
        $.innerHTML = ie;
      }
      B = r(e);
      M = d(e, "P", { "data-svelte-h": true });
      if (H(M) !== "svelte-1vuni30") {
        M.textContent = de;
      }
      V = r(e);
      f(x.$$.fragment, e);
      O = r(e);
      f(W.$$.fragment, e);
      S = r(e);
      s = d(e, "DIV", { class: true });

      const E = F(s);
      f(D.$$.fragment, E);
      ee = r(E);
      N = d(E, "P", { "data-svelte-h": true });
      if (H(N) !== "svelte-pru3yc") {
        N.textContent = me;
      }
      ne = r(E);
      T = d(E, "DIV", { class: true });

      const Q = F(T);
      f(w.$$.fragment, Q);
      te = r(Q);
      z = d(Q, "P", { "data-svelte-h": true });
      if (H(z) !== "svelte-1dmxytt") {
        z.textContent = le;
      }
      Q.forEach(t);
      E.forEach(t);

      G = r(e);
      f(y.$$.fragment, e);
      R = r(e);
      l = d(e, "DIV", { class: true });

      const K = F(l);
      f(A.$$.fragment, K);
      oe = r(K);
      C = d(K, "P", { "data-svelte-h": true });
      if (H(C) !== "svelte-1acihvv") {
        C.innerHTML = ce;
      }
      K.forEach(t);

      J = r(e);
      f(k.$$.fragment, e);
      X = r(e);
      q = d(e, "P", {});
      F(q).forEach(t);
      this.h();
    },

    // Set the attributes shared by the create and claim paths.
    h() {
      I(m, "name", "hf:doc:metadata");
      I(m, "content", Me);
      I(T, "class", "docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8");
      I(s, "class", "docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8");
      I(l, "class", "docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8");
    },

    // Attach the <meta> to document.head and everything else under target `e` before anchor `n`.
    m(e, n) {
      u(document.head, m);
      o(e, L, n);
      o(e, j, n);
      o(e, U, n);
      _(v, e, n);
      o(e, P, n);
      _(b, e, n);
      o(e, Z, n);
      o(e, $, n);
      o(e, B, n);
      o(e, M, n);
      o(e, V, n);
      _(x, e, n);
      o(e, O, n);
      _(W, e, n);
      o(e, S, n);
      o(e, s, n);
      _(D, s, null);
      u(s, ee);
      u(s, N);
      u(s, ne);
      u(s, T);
      _(w, T, null);
      u(T, te);
      u(T, z);
      o(e, G, n);
      _(y, e, n);
      o(e, R, n);
      o(e, l, n);
      _(A, l, null);
      u(l, oe);
      u(l, C);
      o(e, J, n);
      _(k, e, n);
      o(e, X, n);
      o(e, q, n);
      Y = true;
    },

    // Nothing on this page is reactive, so update is the imported no-op.
    p: ue,

    // Start child intro transitions once.
    i(e) {
      if (Y) {
        return;
      }
      p(v.$$.fragment, e);
      p(b.$$.fragment, e);
      p(x.$$.fragment, e);
      p(W.$$.fragment, e);
      p(D.$$.fragment, e);
      p(w.$$.fragment, e);
      p(y.$$.fragment, e);
      p(A.$$.fragment, e);
      p(k.$$.fragment, e);
      Y = true;
    },

    // Start child outro transitions.
    o(e) {
      h(v.$$.fragment, e);
      h(b.$$.fragment, e);
      h(x.$$.fragment, e);
      h(W.$$.fragment, e);
      h(D.$$.fragment, e);
      h(w.$$.fragment, e);
      h(y.$$.fragment, e);
      h(A.$$.fragment, e);
      h(k.$$.fragment, e);
      Y = false;
    },

    // Tear down: top-level nodes are removed only when `e` is truthy; the <meta> is always
    // removed. D, w and A live inside removed containers, so they are destroyed without `e`.
    d(e) {
      if (e) {
        t(L);
        t(j);
        t(U);
        t(P);
        t(Z);
        t($);
        t(B);
        t(M);
        t(V);
        t(O);
        t(S);
        t(s);
        t(G);
        t(R);
        t(l);
        t(J);
        t(X);
        t(q);
      }
      t(m);
      g(v, e);
      g(b, e);
      g(x, e);
      g(W, e);
      g(D);
      g(w);
      g(y, e);
      g(A);
      g(k, e);
    },
  };
}

/**
 * Instance function: registers a callback (via `_e`) that reads the `fw` query parameter
 * and discards the value, then returns an empty instance-context array.
 *
 * @param {*} se Unused.
 * @returns {Array} Always an empty array.
 */
function xe(se) {
  _e(() => {
    new URLSearchParams(window.location.search).get("fw");
  });
  return [];
}

/** Page component wiring the instance function and fragment factory together. */
class Ne extends pe {
  constructor(m) {
    super();
    he(this, m, xe, $e, fe, {});
  }
}

export { Ne as component };
Xet Storage Details
- Size:
- 15 kB
- Xet hash:
- 3e603d7326d067ff6e3ed05ffd3d8ec3cea64661bad275ac865d8922e70c7e82
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.