Buckets:
| import{s as Zt,o as Gt,n as Ae}from"../chunks/scheduler.53228c21.js";import{S as It,i as Bt,e as c,s as l,c as f,h as Vt,a as m,d as n,b as r,f as W,g as M,j as T,k as I,l as g,m as a,n as _,t as y,o as b,p as w,q as Wt,r as xt}from"../chunks/index.100fac89.js";import{D as ce}from"../chunks/Docstring.f8721f67.js";import{C as Je}from"../chunks/CodeBlock.d30a6509.js";import{E as Ut}from"../chunks/ExampleCodeBlock.24511344.js";import{H as be,E as Ct}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.d8195636.js";import{H as kt,a as jt}from"../chunks/HfOption.fad27e59.js";function St(U){let s,h="The example below demonstrates how to use the text-to-video pipeline to generate a video using a text description.",u,i,p;return i=new Je({props:{code:"cGlwZSUyMCUzRCUyMFNhbmFWaWRlb1BpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJFZmZpY2llbnQtTGFyZ2UtTW9kZWwlMkZTQU5BLVZpZGVvXzJCXzQ4MHBfZGlmZnVzZXJzJTIyJTJDJTIwJTBBJTIwJTIwJTIwJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5iZmxvYXQxNiUyQyUwQSklMEFwaXBlLnRleHRfZW5jb2Rlci50byh0b3JjaC5iZmxvYXQxNiklMEFwaXBlLnZhZS50byh0b3JjaC5mbG9hdDMyKSUwQXBpcGUudG8oJTIyY3VkYSUyMiklMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJBJTIwY2F0JTIwYW5kJTIwYSUyMGRvZyUyMGJha2luZyUyMGElMjBjYWtlJTIwdG9nZXRoZXIlMjBpbiUyMGElMjBraXRjaGVuLiUyMFRoZSUyMGNhdCUyMGlzJTIwY2FyZWZ1bGx5JTIwbWVhc3VyaW5nJTIwZmxvdXIlMkMlMjB3aGlsZSUyMHRoZSUyMGRvZyUyMGlzJTIwc3RpcnJpbmclMjB0aGUlMjBiYXR0ZXIlMjB3aXRoJTIwYSUyMHdvb2RlbiUyMHNwb29uLiUyMFRoZSUyMGtpdGNoZW4lMjBpcyUyMGNvenklMkMlMjB3aXRoJTIwc3VubGlnaHQlMjBzdHJlYW1pbmclMjB0aHJvdWdoJTIwdGhlJTIwd2luZG93LiUyMiUwQW5lZ2F0aXZlX3Byb21wdCUyMCUzRCUyMCUyMkElMjBjaGFvdGljJTIwc2VxdWVuY2UlMjB3aXRoJTIwbWlzc2hhcGVuJTJDJTIwZGVmb3JtZWQlMjBsaW1icyUyMGluJTIwaGVhdnklMjBtb3Rpb24lMjBibHVyJTJDJTIwc3VkZGVuJTIwZGlzYXBwZWFyYW5jZSUyQyUyMGp1bXAlMjBjdXRzJTJDJTIwamVya3klMjBtb3ZlbWVudHMlMkMlMjByYXBpZCUyMHNob3QlMjBjaGFuZ2VzJTJDJTIwZnJhbWVzJTIwb3V0JTIwb2YlMjBzeW5jJTJDJTIwaW5jb25zaXN0ZW50JTIwY2hhcmFjdGVyJTIwc2hhcGVzJTJDJTIwdGVtcG9yYWwlMjBhcnRpZmFjdHMlMkMlMjBqaXR0ZXIlMkMlMjBhbmQlMjBnaG9zdGlu
ZyUyMGVmZmVjdHMlMkMlMjBjcmVhdGluZyUyMGElMjBkaXNvcmllbnRpbmclMjB2aXN1YWwlMjBleHBlcmllbmNlLiUyMiUwQW1vdGlvbl9zY2FsZSUyMCUzRCUyMDMwJTBBbW90aW9uX3Byb21wdCUyMCUzRCUyMGYlMjIlMjBtb3Rpb24lMjBzY29yZSUzQSUyMCU3Qm1vdGlvbl9zY2FsZSU3RC4lMjIlMEFwcm9tcHQlMjAlM0QlMjBwcm9tcHQlMjAlMkIlMjBtb3Rpb25fcHJvbXB0JTBBJTBBdmlkZW8lMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMHByb21wdCUzRHByb21wdCUyQyUwQSUyMCUyMCUyMCUyMG5lZ2F0aXZlX3Byb21wdCUzRG5lZ2F0aXZlX3Byb21wdCUyQyUwQSUyMCUyMCUyMCUyMGhlaWdodCUzRDQ4MCUyQyUwQSUyMCUyMCUyMCUyMHdpZHRoJTNEODMyJTJDJTBBJTIwJTIwJTIwJTIwZnJhbWVzJTNEODElMkMlMEElMjAlMjAlMjAlMjBndWlkYW5jZV9zY2FsZSUzRDYlMkMlMEElMjAlMjAlMjAlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNENTAlMkMlMEElMjAlMjAlMjAlMjBnZW5lcmF0b3IlM0R0b3JjaC5HZW5lcmF0b3IoZGV2aWNlJTNEJTIyY3VkYSUyMikubWFudWFsX3NlZWQoMCklMkMlMEEpLmZyYW1lcyU1QjAlNUQlMEElMEFleHBvcnRfdG9fdmlkZW8odmlkZW8lMkMlMjAlMjJzYW5hX3ZpZGVvLm1wNCUyMiUyQyUyMGZwcyUzRDE2KQ==",highlighted:`pipe = SanaVideoPipeline.from_pretrained( | |
| <span class="hljs-string">"Efficient-Large-Model/SANA-Video_2B_480p_diffusers"</span>, | |
| torch_dtype=torch.bfloat16, | |
| ) | |
| pipe.text_encoder.to(torch.bfloat16) | |
| pipe.vae.to(torch.float32) | |
| pipe.to(<span class="hljs-string">"cuda"</span>) | |
| prompt = <span class="hljs-string">"A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."</span> | |
| negative_prompt = <span class="hljs-string">"A chaotic sequence with misshapen, deformed limbs in heavy motion blur, sudden disappearance, jump cuts, jerky movements, rapid shot changes, frames out of sync, inconsistent character shapes, temporal artifacts, jitter, and ghosting effects, creating a disorienting visual experience."</span> | |
| motion_scale = <span class="hljs-number">30</span> | |
| motion_prompt = <span class="hljs-string">f" motion score: <span class="hljs-subst">{motion_scale}</span>."</span> | |
| prompt = prompt + motion_prompt | |
| video = pipe( | |
| prompt=prompt, | |
| negative_prompt=negative_prompt, | |
| height=<span class="hljs-number">480</span>, | |
| width=<span class="hljs-number">832</span>, | |
| frames=<span class="hljs-number">81</span>, | |
| guidance_scale=<span class="hljs-number">6</span>, | |
| num_inference_steps=<span class="hljs-number">50</span>, | |
| generator=torch.Generator(device=<span class="hljs-string">"cuda"</span>).manual_seed(<span class="hljs-number">0</span>), | |
| ).frames[<span class="hljs-number">0</span>] | |
| export_to_video(video, <span class="hljs-string">"sana_video.mp4"</span>, fps=<span class="hljs-number">16</span>)`,wrap:!1}}),{c(){s=c("p"),s.textContent=h,u=l(),f(i.$$.fragment)},l(t){s=m(t,"P",{"data-svelte-h":!0}),T(s)!=="svelte-8nckyn"&&(s.textContent=h),u=r(t),M(i.$$.fragment,t)},m(t,d){a(t,s,d),a(t,u,d),_(i,t,d),p=!0},p:Ae,i(t){p||(y(i.$$.fragment,t),p=!0)},o(t){b(i.$$.fragment,t),p=!1},d(t){t&&(n(s),n(u)),w(i,t)}}}function Nt(U){let s,h="The example below demonstrates how to use the image-to-video pipeline to generate a video using a text description and a starting frame.",u,i,p;return i=new Je({props:{code:"cGlwZSUyMCUzRCUyMFNhbmFJbWFnZVRvVmlkZW9QaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyRWZmaWNpZW50LUxhcmdlLU1vZGVsJTJGU0FOQS1WaWRlb18yQl80ODBwX2RpZmZ1c2VycyUyMiUyQyUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guYmZsb2F0MTYlMkMlMEEpJTBBcGlwZS5zY2hlZHVsZXIlMjAlM0QlMjBGbG93TWF0Y2hFdWxlckRpc2NyZXRlU2NoZWR1bGVyLmZyb21fY29uZmlnKHBpcGUuc2NoZWR1bGVyLmNvbmZpZyUyQyUyMGZsb3dfc2hpZnQlM0Q4LjApJTBBcGlwZS52YWUudG8odG9yY2guZmxvYXQzMiklMEFwaXBlLnRleHRfZW5jb2Rlci50byh0b3JjaC5iZmxvYXQxNiklMEFwaXBlLnRvKCUyMmN1ZGElMjIpJTBBJTBBaW1hZ2UlMjAlM0QlMjBsb2FkX2ltYWdlKCUyMmh0dHBzJTNBJTJGJTJGcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSUyRk5WbGFicyUyRlNhbmElMkZyZWZzJTJGaGVhZHMlMkZtYWluJTJGYXNzZXQlMkZzYW1wbGVzJTJGaTJ2LTEucG5nJTIyKSUwQXByb21wdCUyMCUzRCUyMCUyMkElMjB3b21hbiUyMHN0YW5kcyUyMGFnYWluc3QlMjBhJTIwc3R1bm5pbmclMjBzdW5zZXQlMjBiYWNrZHJvcCUyQyUyMGhlciUyMGxvbmclMkMlMjB3YXZ5JTIwYnJvd24lMjBoYWlyJTIwZ2VudGx5JTIwYmxvd2luZyUyMGluJTIwdGhlJTIwYnJlZXplLiUyMFNoZSUyMHdlYXJzJTIwYSUyMHNsZWV2ZWxlc3MlMkMlMjBsaWdodC1jb2xvcmVkJTIwYmxvdXNlJTIwd2l0aCUyMGElMjBkZWVwJTIwVi1uZWNrbGluZSUyQyUyMHdoaWNoJTIwYWNjZW50dWF0ZXMlMjBoZXIlMjBncmFjZWZ1bCUyMHBvc3R1cmUuJTIwVGhlJTIwd2FybSUyMGh1ZXMlMjBvZiUyMHRoZSUyMHNldHRpbmclMjBzdW4lMjBjYXN0JTIwYSUyMGdvbGRlbiUyMGdsb3clMjBhY3Jvc3MlMjBoZXIlMjBmYWNlJTIwYW5kJTIwaGFpciUyQyUyMGNyZWF0aW5nJTIwYSUyMHNlcmVuZSUyMGFuZCUyMGV0aGVyZWFsJTIwYXRtb3NwaGVyZS4lMjBUaGUlMjBiYWNrZ3JvdW5kJTIwZmVhdH
VyZXMlMjBhJTIwYmx1cnJlZCUyMGxhbmRzY2FwZSUyMHdpdGglMjBzb2Z0JTJDJTIwcm9sbGluZyUyMGhpbGxzJTIwYW5kJTIwc2NhdHRlcmVkJTIwY2xvdWRzJTJDJTIwYWRkaW5nJTIwZGVwdGglMjB0byUyMHRoZSUyMHNjZW5lLiUyMFRoZSUyMGNhbWVyYSUyMHJlbWFpbnMlMjBzdGVhZHklMkMlMjBjYXB0dXJpbmclMjB0aGUlMjB0cmFucXVpbCUyMG1vbWVudCUyMGZyb20lMjBhJTIwbWVkaXVtJTIwY2xvc2UtdXAlMjBhbmdsZS4lMjIlMEFuZWdhdGl2ZV9wcm9tcHQlMjAlM0QlMjAlMjJBJTIwY2hhb3RpYyUyMHNlcXVlbmNlJTIwd2l0aCUyMG1pc3NoYXBlbiUyQyUyMGRlZm9ybWVkJTIwbGltYnMlMjBpbiUyMGhlYXZ5JTIwbW90aW9uJTIwYmx1ciUyQyUyMHN1ZGRlbiUyMGRpc2FwcGVhcmFuY2UlMkMlMjBqdW1wJTIwY3V0cyUyQyUyMGplcmt5JTIwbW92ZW1lbnRzJTJDJTIwcmFwaWQlMjBzaG90JTIwY2hhbmdlcyUyQyUyMGZyYW1lcyUyMG91dCUyMG9mJTIwc3luYyUyQyUyMGluY29uc2lzdGVudCUyMGNoYXJhY3RlciUyMHNoYXBlcyUyQyUyMHRlbXBvcmFsJTIwYXJ0aWZhY3RzJTJDJTIwaml0dGVyJTJDJTIwYW5kJTIwZ2hvc3RpbmclMjBlZmZlY3RzJTJDJTIwY3JlYXRpbmclMjBhJTIwZGlzb3JpZW50aW5nJTIwdmlzdWFsJTIwZXhwZXJpZW5jZS4lMjIlMEFtb3Rpb25fc2NhbGUlMjAlM0QlMjAzMCUwQW1vdGlvbl9wcm9tcHQlMjAlM0QlMjBmJTIyJTIwbW90aW9uJTIwc2NvcmUlM0ElMjAlN0Jtb3Rpb25fc2NhbGUlN0QuJTIyJTBBcHJvbXB0JTIwJTNEJTIwcHJvbXB0JTIwJTJCJTIwbW90aW9uX3Byb21wdCUwQSUwQW1vdGlvbl9zY2FsZSUyMCUzRCUyMDMwLjAlMEElMEF2aWRlbyUyMCUzRCUyMHBpcGUoJTBBJTIwJTIwJTIwJTIwaW1hZ2UlM0RpbWFnZSUyQyUwQSUyMCUyMCUyMCUyMHByb21wdCUzRHByb21wdCUyQyUwQSUyMCUyMCUyMCUyMG5lZ2F0aXZlX3Byb21wdCUzRG5lZ2F0aXZlX3Byb21wdCUyQyUwQSUyMCUyMCUyMCUyMGhlaWdodCUzRDQ4MCUyQyUwQSUyMCUyMCUyMCUyMHdpZHRoJTNEODMyJTJDJTBBJTIwJTIwJTIwJTIwZnJhbWVzJTNEODElMkMlMEElMjAlMjAlMjAlMjBndWlkYW5jZV9zY2FsZSUzRDYlMkMlMEElMjAlMjAlMjAlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNENTAlMkMlMEElMjAlMjAlMjAlMjBnZW5lcmF0b3IlM0R0b3JjaC5HZW5lcmF0b3IoZGV2aWNlJTNEJTIyY3VkYSUyMikubWFudWFsX3NlZWQoMCklMkMlMEEpLmZyYW1lcyU1QjAlNUQlMEElMEFleHBvcnRfdG9fdmlkZW8odmlkZW8lMkMlMjAlMjJzYW5hLWkydi5tcDQlMjIlMkMlMjBmcHMlM0QxNik=",highlighted:`pipe = SanaImageToVideoPipeline.from_pretrained( | |
| <span class="hljs-string">"Efficient-Large-Model/SANA-Video_2B_480p_diffusers"</span>, | |
| torch_dtype=torch.bfloat16, | |
| ) | |
| pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_config(pipe.scheduler.config, flow_shift=<span class="hljs-number">8.0</span>) | |
| pipe.vae.to(torch.float32) | |
| pipe.text_encoder.to(torch.bfloat16) | |
| pipe.to(<span class="hljs-string">"cuda"</span>) | |
| image = load_image(<span class="hljs-string">"https://raw.githubusercontent.com/NVlabs/Sana/refs/heads/main/asset/samples/i2v-1.png"</span>) | |
| prompt = <span class="hljs-string">"A woman stands against a stunning sunset backdrop, her long, wavy brown hair gently blowing in the breeze. She wears a sleeveless, light-colored blouse with a deep V-neckline, which accentuates her graceful posture. The warm hues of the setting sun cast a golden glow across her face and hair, creating a serene and ethereal atmosphere. The background features a blurred landscape with soft, rolling hills and scattered clouds, adding depth to the scene. The camera remains steady, capturing the tranquil moment from a medium close-up angle."</span> | |
| negative_prompt = <span class="hljs-string">"A chaotic sequence with misshapen, deformed limbs in heavy motion blur, sudden disappearance, jump cuts, jerky movements, rapid shot changes, frames out of sync, inconsistent character shapes, temporal artifacts, jitter, and ghosting effects, creating a disorienting visual experience."</span> | |
| motion_scale = <span class="hljs-number">30</span> | |
| motion_prompt = <span class="hljs-string">f" motion score: <span class="hljs-subst">{motion_scale}</span>."</span> | |
| prompt = prompt + motion_prompt | |
| motion_scale = <span class="hljs-number">30.0</span> | |
| video = pipe( | |
| image=image, | |
| prompt=prompt, | |
| negative_prompt=negative_prompt, | |
| height=<span class="hljs-number">480</span>, | |
| width=<span class="hljs-number">832</span>, | |
| frames=<span class="hljs-number">81</span>, | |
| guidance_scale=<span class="hljs-number">6</span>, | |
| num_inference_steps=<span class="hljs-number">50</span>, | |
| generator=torch.Generator(device=<span class="hljs-string">"cuda"</span>).manual_seed(<span class="hljs-number">0</span>), | |
| ).frames[<span class="hljs-number">0</span>] | |
| export_to_video(video, <span class="hljs-string">"sana-i2v.mp4"</span>, fps=<span class="hljs-number">16</span>)`,wrap:!1}}),{c(){s=c("p"),s.textContent=h,u=l(),f(i.$$.fragment)},l(t){s=m(t,"P",{"data-svelte-h":!0}),T(s)!=="svelte-1nn8bjg"&&(s.textContent=h),u=r(t),M(i.$$.fragment,t)},m(t,d){a(t,s,d),a(t,u,d),_(i,t,d),p=!0},p:Ae,i(t){p||(y(i.$$.fragment,t),p=!0)},o(t){b(i.$$.fragment,t),p=!1},d(t){t&&(n(s),n(u)),w(i,t)}}}function Xt(U){let s,h,u,i,p;return h=new jt({props:{id:"generation pipelines",option:"Text-to-Video",$$slots:{default:[St]},$$scope:{ctx:U}}}),i=new jt({props:{id:"generation pipelines",option:"Image-to-Video",$$slots:{default:[Nt]},$$scope:{ctx:U}}}),{c(){s=Wt("`\n"),f(h.$$.fragment),u=l(),f(i.$$.fragment)},l(t){s=xt(t,"`\n"),M(h.$$.fragment,t),u=r(t),M(i.$$.fragment,t)},m(t,d){a(t,s,d),_(h,t,d),a(t,u,d),_(i,t,d),p=!0},p(t,d){const we={};d&2&&(we.$$scope={dirty:d,ctx:t}),h.$set(we);const Y={};d&2&&(Y.$$scope={dirty:d,ctx:t}),i.$set(Y)},i(t){p||(y(h.$$.fragment,t),y(i.$$.fragment,t),p=!0)},o(t){b(h.$$.fragment,t),b(i.$$.fragment,t),p=!1},d(t){t&&(n(s),n(u)),w(h,t),w(i,t)}}}function Yt(U){let s,h="Examples:",u,i,p;return i=new 
Je({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU2FuYVZpZGVvUGlwZWxpbmUlMEFmcm9tJTIwZGlmZnVzZXJzLnV0aWxzJTIwaW1wb3J0JTIwZXhwb3J0X3RvX3ZpZGVvJTBBJTBBcGlwZSUyMCUzRCUyMFNhbmFWaWRlb1BpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMjJFZmZpY2llbnQtTGFyZ2UtTW9kZWwlMkZTQU5BLVZpZGVvXzJCXzQ4MHBfZGlmZnVzZXJzJTIyKSUwQXBpcGUudHJhbnNmb3JtZXIudG8odG9yY2guYmZsb2F0MTYpJTBBcGlwZS50ZXh0X2VuY29kZXIudG8odG9yY2guYmZsb2F0MTYpJTBBcGlwZS52YWUudG8odG9yY2guZmxvYXQzMiklMEFwaXBlLnRvKCUyMmN1ZGElMjIpJTBBbW90aW9uX3Njb3JlJTIwJTNEJTIwMzAlMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJFdmVuaW5nJTJDJTIwYmFja2xpZ2h0JTJDJTIwc2lkZSUyMGxpZ2h0aW5nJTJDJTIwc29mdCUyMGxpZ2h0JTJDJTIwaGlnaCUyMGNvbnRyYXN0JTJDJTIwbWlkLXNob3QlMkMlMjBjZW50ZXJlZCUyMGNvbXBvc2l0aW9uJTJDJTIwY2xlYW4lMjBzb2xvJTIwc2hvdCUyQyUyMHdhcm0lMjBjb2xvci4lMjBBJTIweW91bmclMjBDYXVjYXNpYW4lMjBtYW4lMjBzdGFuZHMlMjBpbiUyMGElMjBmb3Jlc3QlMkMlMjBnb2xkZW4lMjBsaWdodCUyMGdsaW1tZXJzJTIwb24lMjBoaXMlMjBoYWlyJTIwYXMlMjBzdW5saWdodCUyMGZpbHRlcnMlMjB0aHJvdWdoJTIwdGhlJTIwbGVhdmVzLiUyMEhlJTIwd2VhcnMlMjBhJTIwbGlnaHQlMjBzaGlydCUyQyUyMHdpbmQlMjBnZW50bHklMjBibG93aW5nJTIwaGlzJTIwaGFpciUyMGFuZCUyMGNvbGxhciUyQyUyMGxpZ2h0JTIwZGFuY2VzJTIwYWNyb3NzJTIwaGlzJTIwZmFjZSUyMHdpdGglMjBoaXMlMjBtb3ZlbWVudHMuJTIwVGhlJTIwYmFja2dyb3VuZCUyMGlzJTIwYmx1cnJlZCUyQyUyMHdpdGglMjBkYXBwbGVkJTIwbGlnaHQlMjBhbmQlMjBzb2Z0JTIwdHJlZSUyMHNoYWRvd3MlMjBpbiUyMHRoZSUyMGRpc3RhbmNlLiUyMFRoZSUyMGNhbWVyYSUyMGZvY3VzZXMlMjBvbiUyMGhpcyUyMGxpZnRlZCUyMGdhemUlMkMlMjBjbGVhciUyMGFuZCUyMGVtb3Rpb25hbC4lMjIlMEFuZWdhdGl2ZV9wcm9tcHQlMjAlM0QlMjAlMjJBJTIwY2hhb3RpYyUyMHNlcXVlbmNlJTIwd2l0aCUyMG1pc3NoYXBlbiUyQyUyMGRlZm9ybWVkJTIwbGltYnMlMjBpbiUyMGhlYXZ5JTIwbW90aW9uJTIwYmx1ciUyQyUyMHN1ZGRlbiUyMGRpc2FwcGVhcmFuY2UlMkMlMjBqdW1wJTIwY3V0cyUyQyUyMGplcmt5JTIwbW92ZW1lbnRzJTJDJTIwcmFwaWQlMjBzaG90JTIwY2hhbmdlcyUyQyUyMGZyYW1lcyUyMG91dCUyMG9mJTIwc3luYyUyQyUyMGluY29uc2lzdGVudCUyMGNoYXJhY3RlciUyMHNoYXBlcyUyQyUyMHRlbXBvcmFsJTIwYXJ0aWZhY3RzJTJDJTIwaml0dGVyJTJDJTIwYW5kJTIwZ2hvc3RpbmclMjBlZmZlY3RzJTJDJTIwY3JlYXRpbmclMjBhJTIwZGlzb3JpZW50aW5nJTIwdmlzdWFsJTI
wZXhwZXJpZW5jZS4lMjIlMEFtb3Rpb25fcHJvbXB0JTIwJTNEJTIwZiUyMiUyMG1vdGlvbiUyMHNjb3JlJTNBJTIwJTdCbW90aW9uX3Njb3JlJTdELiUyMiUwQXByb21wdCUyMCUzRCUyMHByb21wdCUyMCUyQiUyMG1vdGlvbl9wcm9tcHQlMEElMEFvdXRwdXQlMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMHByb21wdCUzRHByb21wdCUyQyUwQSUyMCUyMCUyMCUyMG5lZ2F0aXZlX3Byb21wdCUzRG5lZ2F0aXZlX3Byb21wdCUyQyUwQSUyMCUyMCUyMCUyMGhlaWdodCUzRDQ4MCUyQyUwQSUyMCUyMCUyMCUyMHdpZHRoJTNEODMyJTJDJTBBJTIwJTIwJTIwJTIwZnJhbWVzJTNEODElMkMlMEElMjAlMjAlMjAlMjBndWlkYW5jZV9zY2FsZSUzRDYlMkMlMEElMjAlMjAlMjAlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNENTAlMkMlMEElMjAlMjAlMjAlMjBnZW5lcmF0b3IlM0R0b3JjaC5HZW5lcmF0b3IoZGV2aWNlJTNEJTIyY3VkYSUyMikubWFudWFsX3NlZWQoNDIpJTJDJTBBKS5mcmFtZXMlNUIwJTVEJTBBJTBBZXhwb3J0X3RvX3ZpZGVvKG91dHB1dCUyQyUyMCUyMnNhbmEtdmlkZW8tb3V0cHV0Lm1wNCUyMiUyQyUyMGZwcyUzRDE2KQ==",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> SanaVideoPipeline | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_video | |
| <span class="hljs-meta">>>> </span>pipe = SanaVideoPipeline.from_pretrained(<span class="hljs-string">"Efficient-Large-Model/SANA-Video_2B_480p_diffusers"</span>) | |
| <span class="hljs-meta">>>> </span>pipe.transformer.to(torch.bfloat16) | |
| <span class="hljs-meta">>>> </span>pipe.text_encoder.to(torch.bfloat16) | |
| <span class="hljs-meta">>>> </span>pipe.vae.to(torch.float32) | |
| <span class="hljs-meta">>>> </span>pipe.to(<span class="hljs-string">"cuda"</span>) | |
| <span class="hljs-meta">>>> </span>motion_score = <span class="hljs-number">30</span> | |
| <span class="hljs-meta">>>> </span>prompt = <span class="hljs-string">"Evening, backlight, side lighting, soft light, high contrast, mid-shot, centered composition, clean solo shot, warm color. A young Caucasian man stands in a forest, golden light glimmers on his hair as sunlight filters through the leaves. He wears a light shirt, wind gently blowing his hair and collar, light dances across his face with his movements. The background is blurred, with dappled light and soft tree shadows in the distance. The camera focuses on his lifted gaze, clear and emotional."</span> | |
| <span class="hljs-meta">>>> </span>negative_prompt = <span class="hljs-string">"A chaotic sequence with misshapen, deformed limbs in heavy motion blur, sudden disappearance, jump cuts, jerky movements, rapid shot changes, frames out of sync, inconsistent character shapes, temporal artifacts, jitter, and ghosting effects, creating a disorienting visual experience."</span> | |
| <span class="hljs-meta">>>> </span>motion_prompt = <span class="hljs-string">f" motion score: <span class="hljs-subst">{motion_score}</span>."</span> | |
| <span class="hljs-meta">>>> </span>prompt = prompt + motion_prompt | |
| <span class="hljs-meta">>>> </span>output = pipe( | |
| <span class="hljs-meta">... </span> prompt=prompt, | |
| <span class="hljs-meta">... </span> negative_prompt=negative_prompt, | |
| <span class="hljs-meta">... </span> height=<span class="hljs-number">480</span>, | |
| <span class="hljs-meta">... </span> width=<span class="hljs-number">832</span>, | |
| <span class="hljs-meta">... </span> frames=<span class="hljs-number">81</span>, | |
| <span class="hljs-meta">... </span> guidance_scale=<span class="hljs-number">6</span>, | |
| <span class="hljs-meta">... </span> num_inference_steps=<span class="hljs-number">50</span>, | |
| <span class="hljs-meta">... </span> generator=torch.Generator(device=<span class="hljs-string">"cuda"</span>).manual_seed(<span class="hljs-number">42</span>), | |
| <span class="hljs-meta">... </span>).frames[<span class="hljs-number">0</span>] | |
| <span class="hljs-meta">>>> </span>export_to_video(output, <span class="hljs-string">"sana-video-output.mp4"</span>, fps=<span class="hljs-number">16</span>)`,wrap:!1}}),{c(){s=c("p"),s.textContent=h,u=l(),f(i.$$.fragment)},l(t){s=m(t,"P",{"data-svelte-h":!0}),T(s)!=="svelte-kvfsh7"&&(s.textContent=h),u=r(t),M(i.$$.fragment,t)},m(t,d){a(t,s,d),a(t,u,d),_(i,t,d),p=!0},p:Ae,i(t){p||(y(i.$$.fragment,t),p=!0)},o(t){b(i.$$.fragment,t),p=!1},d(t){t&&(n(s),n(u)),w(i,t)}}}function Qt(U){let s,h="Examples:",u,i,p;return i=new Je({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU2FuYUltYWdlVG9WaWRlb1BpcGVsaW5lJTBBZnJvbSUyMGRpZmZ1c2Vycy51dGlscyUyMGltcG9ydCUyMGV4cG9ydF90b192aWRlbyUyQyUyMGxvYWRfaW1hZ2UlMEElMEFwaXBlJTIwJTNEJTIwU2FuYUltYWdlVG9WaWRlb1BpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMjJFZmZpY2llbnQtTGFyZ2UtTW9kZWwlMkZTQU5BLVZpZGVvXzJCXzQ4MHBfZGlmZnVzZXJzJTIyKSUwQXBpcGUudHJhbnNmb3JtZXIudG8odG9yY2guYmZsb2F0MTYpJTBBcGlwZS50ZXh0X2VuY29kZXIudG8odG9yY2guYmZsb2F0MTYpJTBBcGlwZS52YWUudG8odG9yY2guZmxvYXQzMiklMEFwaXBlLnRvKCUyMmN1ZGElMjIpJTBBbW90aW9uX3Njb3JlJTIwJTNEJTIwMzAlMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJBJTIwd29tYW4lMjBzdGFuZHMlMjBhZ2FpbnN0JTIwYSUyMHN0dW5uaW5nJTIwc3Vuc2V0JTIwYmFja2Ryb3AlMkMlMjBoZXIlMjBsb25nJTJDJTIwd2F2eSUyMGJyb3duJTIwaGFpciUyMGdlbnRseSUyMGJsb3dpbmclMjBpbiUyMHRoZSUyMGJyZWV6ZS4lMjBTaGUlMjB3ZWFycyUyMGElMjBzbGVldmVsZXNzJTJDJTIwbGlnaHQtY29sb3JlZCUyMGJsb3VzZSUyMHdpdGglMjBhJTIwZGVlcCUyMFYtbmVja2xpbmUlMkMlMjB3aGljaCUyMGFjY2VudHVhdGVzJTIwaGVyJTIwZ3JhY2VmdWwlMjBwb3N0dXJlLiUyMFRoZSUyMHdhcm0lMjBodWVzJTIwb2YlMjB0aGUlMjBzZXR0aW5nJTIwc3VuJTIwY2FzdCUyMGElMjBnb2xkZW4lMjBnbG93JTIwYWNyb3NzJTIwaGVyJTIwZmFjZSUyMGFuZCUyMGhhaXIlMkMlMjBjcmVhdGluZyUyMGElMjBzZXJlbmUlMjBhbmQlMjBldGhlcmVhbCUyMGF0bW9zcGhlcmUuJTIwVGhlJTIwYmFja2dyb3VuZCUyMGZlYXR1cmVzJTIwYSUyMGJsdXJyZWQlMjBsYW5kc2NhcGUlMjB3aXRoJTIwc29mdCUyQyUyMHJvbGxpbmclMjBoaWxscyUyMGFuZCUyMHNjYXR0ZXJlZCUyMGNsb3VkcyUyQyUyMGFkZGluZyUyMGRlcHRoJTIwdG8lMjB0aGUlMjBzY2VuZS4lMjBUaGUlMjBjYW1lcmElMjByZW1haW5zJTIwc3RlYWR5JTJDJTIwY2FwdHVyaW
5nJTIwdGhlJTIwdHJhbnF1aWwlMjBtb21lbnQlMjBmcm9tJTIwYSUyMG1lZGl1bSUyMGNsb3NlLXVwJTIwYW5nbGUuJTIyJTBBbmVnYXRpdmVfcHJvbXB0JTIwJTNEJTIwJTIyQSUyMGNoYW90aWMlMjBzZXF1ZW5jZSUyMHdpdGglMjBtaXNzaGFwZW4lMkMlMjBkZWZvcm1lZCUyMGxpbWJzJTIwaW4lMjBoZWF2eSUyMG1vdGlvbiUyMGJsdXIlMkMlMjBzdWRkZW4lMjBkaXNhcHBlYXJhbmNlJTJDJTIwanVtcCUyMGN1dHMlMkMlMjBqZXJreSUyMG1vdmVtZW50cyUyQyUyMHJhcGlkJTIwc2hvdCUyMGNoYW5nZXMlMkMlMjBmcmFtZXMlMjBvdXQlMjBvZiUyMHN5bmMlMkMlMjBpbmNvbnNpc3RlbnQlMjBjaGFyYWN0ZXIlMjBzaGFwZXMlMkMlMjB0ZW1wb3JhbCUyMGFydGlmYWN0cyUyQyUyMGppdHRlciUyQyUyMGFuZCUyMGdob3N0aW5nJTIwZWZmZWN0cyUyQyUyMGNyZWF0aW5nJTIwYSUyMGRpc29yaWVudGluZyUyMHZpc3VhbCUyMGV4cGVyaWVuY2UuJTIyJTBBbW90aW9uX3Byb21wdCUyMCUzRCUyMGYlMjIlMjBtb3Rpb24lMjBzY29yZSUzQSUyMCU3Qm1vdGlvbl9zY29yZSU3RC4lMjIlMEFwcm9tcHQlMjAlM0QlMjBwcm9tcHQlMjAlMkIlMjBtb3Rpb25fcHJvbXB0JTBBaW1hZ2UlMjAlM0QlMjBsb2FkX2ltYWdlKCUyMmh0dHBzJTNBJTJGJTJGcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSUyRk5WbGFicyUyRlNhbmElMkZyZWZzJTJGaGVhZHMlMkZtYWluJTJGYXNzZXQlMkZzYW1wbGVzJTJGaTJ2LTEucG5nJTIyKSUwQSUwQW91dHB1dCUyMCUzRCUyMHBpcGUoJTBBJTIwJTIwJTIwJTIwaW1hZ2UlM0RpbWFnZSUyQyUwQSUyMCUyMCUyMCUyMHByb21wdCUzRHByb21wdCUyQyUwQSUyMCUyMCUyMCUyMG5lZ2F0aXZlX3Byb21wdCUzRG5lZ2F0aXZlX3Byb21wdCUyQyUwQSUyMCUyMCUyMCUyMGhlaWdodCUzRDQ4MCUyQyUwQSUyMCUyMCUyMCUyMHdpZHRoJTNEODMyJTJDJTBBJTIwJTIwJTIwJTIwZnJhbWVzJTNEODElMkMlMEElMjAlMjAlMjAlMjBndWlkYW5jZV9zY2FsZSUzRDYlMkMlMEElMjAlMjAlMjAlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNENTAlMkMlMEElMjAlMjAlMjAlMjBnZW5lcmF0b3IlM0R0b3JjaC5HZW5lcmF0b3IoZGV2aWNlJTNEJTIyY3VkYSUyMikubWFudWFsX3NlZWQoNDIpJTJDJTBBKS5mcmFtZXMlNUIwJTVEJTBBJTBBZXhwb3J0X3RvX3ZpZGVvKG91dHB1dCUyQyUyMCUyMnNhbmEtdGkydi1vdXRwdXQubXA0JTIyJTJDJTIwZnBzJTNEMTYp",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> SanaImageToVideoPipeline | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_video, load_image | |
| <span class="hljs-meta">>>> </span>pipe = SanaImageToVideoPipeline.from_pretrained(<span class="hljs-string">"Efficient-Large-Model/SANA-Video_2B_480p_diffusers"</span>) | |
| <span class="hljs-meta">>>> </span>pipe.transformer.to(torch.bfloat16) | |
| <span class="hljs-meta">>>> </span>pipe.text_encoder.to(torch.bfloat16) | |
| <span class="hljs-meta">>>> </span>pipe.vae.to(torch.float32) | |
| <span class="hljs-meta">>>> </span>pipe.to(<span class="hljs-string">"cuda"</span>) | |
| <span class="hljs-meta">>>> </span>motion_score = <span class="hljs-number">30</span> | |
| <span class="hljs-meta">>>> </span>prompt = <span class="hljs-string">"A woman stands against a stunning sunset backdrop, her long, wavy brown hair gently blowing in the breeze. She wears a sleeveless, light-colored blouse with a deep V-neckline, which accentuates her graceful posture. The warm hues of the setting sun cast a golden glow across her face and hair, creating a serene and ethereal atmosphere. The background features a blurred landscape with soft, rolling hills and scattered clouds, adding depth to the scene. The camera remains steady, capturing the tranquil moment from a medium close-up angle."</span> | |
| <span class="hljs-meta">>>> </span>negative_prompt = <span class="hljs-string">"A chaotic sequence with misshapen, deformed limbs in heavy motion blur, sudden disappearance, jump cuts, jerky movements, rapid shot changes, frames out of sync, inconsistent character shapes, temporal artifacts, jitter, and ghosting effects, creating a disorienting visual experience."</span> | |
| <span class="hljs-meta">>>> </span>motion_prompt = <span class="hljs-string">f" motion score: <span class="hljs-subst">{motion_score}</span>."</span> | |
| <span class="hljs-meta">>>> </span>prompt = prompt + motion_prompt | |
| <span class="hljs-meta">>>> </span>image = load_image(<span class="hljs-string">"https://raw.githubusercontent.com/NVlabs/Sana/refs/heads/main/asset/samples/i2v-1.png"</span>) | |
| <span class="hljs-meta">>>> </span>output = pipe( | |
| <span class="hljs-meta">... </span> image=image, | |
| <span class="hljs-meta">... </span> prompt=prompt, | |
| <span class="hljs-meta">... </span> negative_prompt=negative_prompt, | |
| <span class="hljs-meta">... </span> height=<span class="hljs-number">480</span>, | |
| <span class="hljs-meta">... </span> width=<span class="hljs-number">832</span>, | |
| <span class="hljs-meta">... </span> frames=<span class="hljs-number">81</span>, | |
| <span class="hljs-meta">... </span> guidance_scale=<span class="hljs-number">6</span>, | |
| <span class="hljs-meta">... </span> num_inference_steps=<span class="hljs-number">50</span>, | |
| <span class="hljs-meta">... </span> generator=torch.Generator(device=<span class="hljs-string">"cuda"</span>).manual_seed(<span class="hljs-number">42</span>), | |
| <span class="hljs-meta">... </span>).frames[<span class="hljs-number">0</span>] | |
| <span class="hljs-meta">>>> </span>export_to_video(output, <span class="hljs-string">"sana-ti2v-output.mp4"</span>, fps=<span class="hljs-number">16</span>)`,wrap:!1}}),{c(){s=c("p"),s.textContent=h,u=l(),f(i.$$.fragment)},l(t){s=m(t,"P",{"data-svelte-h":!0}),T(s)!=="svelte-kvfsh7"&&(s.textContent=h),u=r(t),M(i.$$.fragment,t)},m(t,d){a(t,s,d),a(t,u,d),_(i,t,d),p=!0},p:Ae,i(t){p||(y(i.$$.fragment,t),p=!0)},o(t){b(i.$$.fragment,t),p=!1},d(t){t&&(n(s),n(u)),w(i,t)}}}function zt(U){let s,h,u,i,p,t,d,we='<img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/> <img alt="MPS" src="https://img.shields.io/badge/MPS-000000?style=flat&logo=apple&logoColor=white%22"/>',Y,Q,rt='<a href="https://huggingface.co/papers/2509.24695" rel="nofollow">SANA-Video: Efficient Video Generation with Block Linear Diffusion Transformer</a> from NVIDIA and MIT HAN Lab, by Junsong Chen, Yuyang Zhao, Jincheng Yu, Ruihang Chu, Junyu Chen, Shuai Yang, Xianbang Wang, Yicheng Pan, Daquan Zhou, Huan Ling, Haozhe Liu, Hongwei Yi, Hao Zhang, Muyang Li, Yukang Chen, Han Cai, Sanja Fidler, Ping Luo, Song Han, Enze Xie.',ve,z,dt="The abstract from the paper is:",Ue,R,pt='<em>We introduce SANA-Video, a small diffusion model that can efficiently generate videos up to 720x1280 resolution and minute-length duration. SANA-Video synthesizes high-resolution, high-quality and long videos with strong text-video alignment at a remarkably fast speed, deployable on RTX 5090 GPU. Two core designs ensure our efficient, effective and long video generation: (1) Linear DiT: We leverage linear attention as the core operation, which is more efficient than vanilla attention given the large number of tokens processed in video generation. (2) Constant-Memory KV cache for Block Linear Attention: we design block-wise autoregressive approach for long video generation by employing a constant-memory state, derived from the cumulative properties of linear attention. 
This KV cache provides the Linear DiT with global context at a fixed memory cost, eliminating the need for a traditional KV cache and enabling efficient, minute-long video generation. In addition, we explore effective data filters and model training strategies, narrowing the training cost to 12 days on 64 H100 GPUs, which is only 1% of the cost of MovieGen. Given its low cost, SANA-Video achieves competitive performance compared to modern state-of-the-art small diffusion models (e.g., Wan 2.1-1.3B and SkyReel-V2-1.3B) while being 16x faster in measured latency. Moreover, SANA-Video can be deployed on RTX 5090 GPUs with NVFP4 precision, accelerating the inference speed of generating a 5-second 720p video from 71s to 29s (2.4x speedup). In summary, SANA-Video enables low-cost, high-quality video generation. <a href="https://github.com/NVlabs/SANA" rel="nofollow">this https URL</a>.</em>',je,E,ct='This pipeline was contributed by SANA Team. The original codebase can be found <a href="https://github.com/NVlabs/Sana" rel="nofollow">here</a>. The original weights can be found under <a href="https://hf.co/collections/Efficient-Large-Model/sana-video" rel="nofollow">hf.co/Efficient-Large-Model</a>.',Ze,F,mt="Available models:",Ge,$,ut='<thead><tr><th align="center">Model</th> <th align="center">Recommended dtype</th></tr></thead> <tbody><tr><td align="center"><a href="https://huggingface.co/Efficient-Large-Model/ANA-Video_2B_480p_diffusers" rel="nofollow"><code>Efficient-Large-Model/SANA-Video_2B_480p_diffusers</code></a></td> <td align="center"><code>torch.bfloat16</code></td></tr></tbody>',Ie,H,ht='Refer to <a href="https://huggingface.co/collections/Efficient-Large-Model/sana-video" rel="nofollow">this</a> collection for more information.',Be,P,gt="Note: The recommended dtype mentioned is for the transformer weights. The text encoder and VAE weights must stay in <code>torch.bfloat16</code> or <code>torch.float32</code> for the model to work correctly. 
Please refer to the inference example below to see how to load the model with the recommended dtype.",Ve,A,We,x,xe,q,Ce,L,ft="Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on video quality depending on the video model.",ke,D,Mt='Refer to the <a href="../../quantization/overview">Quantization</a> overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized <a href="/docs/diffusers/pr_12249/en/api/pipelines/sana_video#diffusers.SanaVideoPipeline">SanaVideoPipeline</a> for inference with bitsandbytes.',Se,O,Ne,K,Xe,J,ee,qe,me,_t=`Pipeline for text-to-video generation using <a href="https://huggingface.co/papers/2509.24695" rel="nofollow">Sana</a>. This model inherits | |
| from <a href="/docs/diffusers/pr_12249/en/api/pipelines/overview#diffusers.DiffusionPipeline">DiffusionPipeline</a>. Check the superclass documentation for the generic methods implemented for all | |
| pipelines (downloading, saving, running on a particular device, etc.).`,Le,Z,te,De,ue,yt="Function invoked when calling the pipeline for generation.",Oe,C,Ke,k,ne,et,he,bt="Encodes the prompt into text encoder hidden states.",Ye,oe,Qe,v,ae,tt,ge,wt=`Pipeline for image/text-to-video generation using <a href="https://huggingface.co/papers/2509.24695" rel="nofollow">Sana</a>. This model | |
| inherits from <a href="/docs/diffusers/pr_12249/en/api/pipelines/overview#diffusers.DiffusionPipeline">DiffusionPipeline</a>. Check the superclass documentation for the generic methods implemented for all | |
| pipelines (downloading, saving, running on a particular device, etc.).`,nt,G,se,ot,fe,Tt="Function invoked when calling the pipeline for generation.",at,S,st,N,ie,it,Me,Jt="Encodes the prompt into text encoder hidden states.",ze,le,Re,B,re,lt,_e,vt="Output class for Sana-Video pipelines.",Ee,de,Fe,Te,$e;return p=new be({props:{title:"Sana-Video",local:"sana-video",headingTag:"h1"}}),A=new be({props:{title:"Generation Pipelines",local:"generation-pipelines",headingTag:"h2"}}),x=new kt({props:{id:"generation pipelines",options:["Text-to-Video","Image-to-Video"],$$slots:{default:[Xt]},$$scope:{ctx:U}}}),q=new be({props:{title:"Quantization",local:"quantization",headingTag:"h2"}}),O=new Je({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwQml0c0FuZEJ5dGVzQ29uZmlnJTIwYXMlMjBEaWZmdXNlcnNCaXRzQW5kQnl0ZXNDb25maWclMkMlMjBTYW5hVmlkZW9UcmFuc2Zvcm1lcjNETW9kZWwlMkMlMjBTYW5hVmlkZW9QaXBlbGluZSUwQWZyb20lMjB0cmFuc2Zvcm1lcnMlMjBpbXBvcnQlMjBCaXRzQW5kQnl0ZXNDb25maWclMjBhcyUyMEJpdHNBbmRCeXRlc0NvbmZpZyUyQyUyMEF1dG9Nb2RlbCUwQSUwQXF1YW50X2NvbmZpZyUyMCUzRCUyMEJpdHNBbmRCeXRlc0NvbmZpZyhsb2FkX2luXzhiaXQlM0RUcnVlKSUwQXRleHRfZW5jb2Rlcl84Yml0JTIwJTNEJTIwQXV0b01vZGVsLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJFZmZpY2llbnQtTGFyZ2UtTW9kZWwlMkZTQU5BLVZpZGVvXzJCXzQ4MHBfZGlmZnVzZXJzJTIyJTJDJTBBJTIwJTIwJTIwJTIwc3ViZm9sZGVyJTNEJTIydGV4dF9lbmNvZGVyJTIyJTJDJTBBJTIwJTIwJTIwJTIwcXVhbnRpemF0aW9uX2NvbmZpZyUzRHF1YW50X2NvbmZpZyUyQyUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiUyQyUwQSklMEElMEFxdWFudF9jb25maWclMjAlM0QlMjBEaWZmdXNlcnNCaXRzQW5kQnl0ZXNDb25maWcobG9hZF9pbl84Yml0JTNEVHJ1ZSklMEF0cmFuc2Zvcm1lcl84Yml0JTIwJTNEJTIwU2FuYVZpZGVvVHJhbnNmb3JtZXIzRE1vZGVsLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJFZmZpY2llbnQtTGFyZ2UtTW9kZWwlMkZTQU5BLVZpZGVvXzJCXzQ4MHBfZGlmZnVzZXJzJTIyJTJDJTBBJTIwJTIwJTIwJTIwc3ViZm9sZGVyJTNEJTIydHJhbnNmb3JtZXIlMjIlMkMlMEElMjAlMjAlMjAlMjBxdWFudGl6YXRpb25fY29uZmlnJTNEcXVhbnRfY29uZmlnJTJDJTBBJTIwJTIwJTIwJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5mbG9hdDE2JTJDJTBBKSUwQ
SUwQXBpcGVsaW5lJTIwJTNEJTIwU2FuYVZpZGVvUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMkVmZmljaWVudC1MYXJnZS1Nb2RlbCUyRlNBTkEtVmlkZW9fMkJfNDgwcF9kaWZmdXNlcnMlMjIlMkMlMEElMjAlMjAlMjAlMjB0ZXh0X2VuY29kZXIlM0R0ZXh0X2VuY29kZXJfOGJpdCUyQyUwQSUyMCUyMCUyMCUyMHRyYW5zZm9ybWVyJTNEdHJhbnNmb3JtZXJfOGJpdCUyQyUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiUyQyUwQSUyMCUyMCUyMCUyMGRldmljZV9tYXAlM0QlMjJiYWxhbmNlZCUyMiUyQyUwQSklMEElMEFtb2RlbF9zY29yZSUyMCUzRCUyMDMwJTBBcHJvbXB0JTIwJTNEJTIwJTIyRXZlbmluZyUyQyUyMGJhY2tsaWdodCUyQyUyMHNpZGUlMjBsaWdodGluZyUyQyUyMHNvZnQlMjBsaWdodCUyQyUyMGhpZ2glMjBjb250cmFzdCUyQyUyMG1pZC1zaG90JTJDJTIwY2VudGVyZWQlMjBjb21wb3NpdGlvbiUyQyUyMGNsZWFuJTIwc29sbyUyMHNob3QlMkMlMjB3YXJtJTIwY29sb3IuJTIwQSUyMHlvdW5nJTIwQ2F1Y2FzaWFuJTIwbWFuJTIwc3RhbmRzJTIwaW4lMjBhJTIwZm9yZXN0JTJDJTIwZ29sZGVuJTIwbGlnaHQlMjBnbGltbWVycyUyMG9uJTIwaGlzJTIwaGFpciUyMGFzJTIwc3VubGlnaHQlMjBmaWx0ZXJzJTIwdGhyb3VnaCUyMHRoZSUyMGxlYXZlcy4lMjBIZSUyMHdlYXJzJTIwYSUyMGxpZ2h0JTIwc2hpcnQlMkMlMjB3aW5kJTIwZ2VudGx5JTIwYmxvd2luZyUyMGhpcyUyMGhhaXIlMjBhbmQlMjBjb2xsYXIlMkMlMjBsaWdodCUyMGRhbmNlcyUyMGFjcm9zcyUyMGhpcyUyMGZhY2UlMjB3aXRoJTIwaGlzJTIwbW92ZW1lbnRzLiUyMFRoZSUyMGJhY2tncm91bmQlMjBpcyUyMGJsdXJyZWQlMkMlMjB3aXRoJTIwZGFwcGxlZCUyMGxpZ2h0JTIwYW5kJTIwc29mdCUyMHRyZWUlMjBzaGFkb3dzJTIwaW4lMjB0aGUlMjBkaXN0YW5jZS4lMjBUaGUlMjBjYW1lcmElMjBmb2N1c2VzJTIwb24lMjBoaXMlMjBsaWZ0ZWQlMjBnYXplJTJDJTIwY2xlYXIlMjBhbmQlMjBlbW90aW9uYWwuJTIyJTBBbmVnYXRpdmVfcHJvbXB0JTIwJTNEJTIwJTIyQSUyMGNoYW90aWMlMjBzZXF1ZW5jZSUyMHdpdGglMjBtaXNzaGFwZW4lMkMlMjBkZWZvcm1lZCUyMGxpbWJzJTIwaW4lMjBoZWF2eSUyMG1vdGlvbiUyMGJsdXIlMkMlMjBzdWRkZW4lMjBkaXNhcHBlYXJhbmNlJTJDJTIwanVtcCUyMGN1dHMlMkMlMjBqZXJreSUyMG1vdmVtZW50cyUyQyUyMHJhcGlkJTIwc2hvdCUyMGNoYW5nZXMlMkMlMjBmcmFtZXMlMjBvdXQlMjBvZiUyMHN5bmMlMkMlMjBpbmNvbnNpc3RlbnQlMjBjaGFyYWN0ZXIlMjBzaGFwZXMlMkMlMjB0ZW1wb3JhbCUyMGFydGlmYWN0cyUyQyUyMGppdHRlciUyQyUyMGFuZCUyMGdob3N0aW5nJTIwZWZmZWN0cyUyQyUyMGNyZWF0aW5nJTIwYSUyMGRpc29yaWVudGluZyUyMHZpc3VhbCUyMGV4cGVyaWVuY2UuJTIyJTBBbW90aW9uX3Byb21wdCUyMCUzRCUyMGYlMjIlM
jBtb3Rpb24lMjBzY29yZSUzQSUyMCU3Qm1vZGVsX3Njb3JlJTdELiUyMiUwQXByb21wdCUyMCUzRCUyMHByb21wdCUyMCUyQiUyMG1vdGlvbl9wcm9tcHQlMEElMEFvdXRwdXQlMjAlM0QlMjBwaXBlbGluZSglMEElMjAlMjAlMjAlMjBwcm9tcHQlM0Rwcm9tcHQlMkMlMEElMjAlMjAlMjAlMjBuZWdhdGl2ZV9wcm9tcHQlM0RuZWdhdGl2ZV9wcm9tcHQlMkMlMEElMjAlMjAlMjAlMjBoZWlnaHQlM0Q0ODAlMkMlMEElMjAlMjAlMjAlMjB3aWR0aCUzRDgzMiUyQyUwQSUyMCUyMCUyMCUyMG51bV9mcmFtZXMlM0Q4MSUyQyUwQSUyMCUyMCUyMCUyMGd1aWRhbmNlX3NjYWxlJTNENi4wJTJDJTBBJTIwJTIwJTIwJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDUwJTBBKS5mcmFtZXMlNUIwJTVEJTBBZXhwb3J0X3RvX3ZpZGVvKG91dHB1dCUyQyUyMCUyMnNhbmEtdmlkZW8tb3V0cHV0Lm1wNCUyMiUyQyUyMGZwcyUzRDE2KQ==",highlighted:`<span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> BitsAndBytesConfig <span class="hljs-keyword">as</span> DiffusersBitsAndBytesConfig, SanaVideoTransformer3DModel, SanaVideoPipeline | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> BitsAndBytesConfig <span class="hljs-keyword">as</span> BitsAndBytesConfig, AutoModel | |
| quant_config = BitsAndBytesConfig(load_in_8bit=<span class="hljs-literal">True</span>) | |
| text_encoder_8bit = AutoModel.from_pretrained( | |
| <span class="hljs-string">"Efficient-Large-Model/SANA-Video_2B_480p_diffusers"</span>, | |
| subfolder=<span class="hljs-string">"text_encoder"</span>, | |
| quantization_config=quant_config, | |
| torch_dtype=torch.float16, | |
| ) | |
| quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=<span class="hljs-literal">True</span>) | |
| transformer_8bit = SanaVideoTransformer3DModel.from_pretrained( | |
| <span class="hljs-string">"Efficient-Large-Model/SANA-Video_2B_480p_diffusers"</span>, | |
| subfolder=<span class="hljs-string">"transformer"</span>, | |
| quantization_config=quant_config, | |
| torch_dtype=torch.float16, | |
| ) | |
| pipeline = SanaVideoPipeline.from_pretrained( | |
| <span class="hljs-string">"Efficient-Large-Model/SANA-Video_2B_480p_diffusers"</span>, | |
| text_encoder=text_encoder_8bit, | |
| transformer=transformer_8bit, | |
| torch_dtype=torch.float16, | |
| device_map=<span class="hljs-string">"balanced"</span>, | |
| ) | |
| model_score = <span class="hljs-number">30</span> | |
| prompt = <span class="hljs-string">"Evening, backlight, side lighting, soft light, high contrast, mid-shot, centered composition, clean solo shot, warm color. A young Caucasian man stands in a forest, golden light glimmers on his hair as sunlight filters through the leaves. He wears a light shirt, wind gently blowing his hair and collar, light dances across his face with his movements. The background is blurred, with dappled light and soft tree shadows in the distance. The camera focuses on his lifted gaze, clear and emotional."</span> | |
| negative_prompt = <span class="hljs-string">"A chaotic sequence with misshapen, deformed limbs in heavy motion blur, sudden disappearance, jump cuts, jerky movements, rapid shot changes, frames out of sync, inconsistent character shapes, temporal artifacts, jitter, and ghosting effects, creating a disorienting visual experience."</span> | |
| motion_prompt = <span class="hljs-string">f" motion score: <span class="hljs-subst">{model_score}</span>."</span> | |
| prompt = prompt + motion_prompt | |
| output = pipeline( | |
| prompt=prompt, | |
| negative_prompt=negative_prompt, | |
| height=<span class="hljs-number">480</span>, | |
| width=<span class="hljs-number">832</span>, | |
| num_frames=<span class="hljs-number">81</span>, | |
| guidance_scale=<span class="hljs-number">6.0</span>, | |
| num_inference_steps=<span class="hljs-number">50</span> | |
| ).frames[<span class="hljs-number">0</span>] | |
| export_to_video(output, <span class="hljs-string">"sana-video-output.mp4"</span>, fps=<span class="hljs-number">16</span>)`,wrap:!1}}),K=new be({props:{title:"SanaVideoPipeline",local:"diffusers.SanaVideoPipeline",headingTag:"h2"}}),ee=new ce({props:{name:"class diffusers.SanaVideoPipeline",anchor:"diffusers.SanaVideoPipeline",parameters:[{name:"tokenizer",val:": typing.Union[transformers.models.gemma.tokenization_gemma.GemmaTokenizer, transformers.models.gemma.tokenization_gemma_fast.GemmaTokenizerFast]"},{name:"text_encoder",val:": Gemma2PreTrainedModel"},{name:"vae",val:": typing.Union[diffusers.models.autoencoders.autoencoder_dc.AutoencoderDC, diffusers.models.autoencoders.autoencoder_kl_wan.AutoencoderKLWan]"},{name:"transformer",val:": SanaVideoTransformer3DModel"},{name:"scheduler",val:": DPMSolverMultistepScheduler"}],parametersDescription:[{anchor:"diffusers.SanaVideoPipeline.tokenizer",description:`<strong>tokenizer</strong> (<code>GemmaTokenizer</code> or <code>GemmaTokenizerFast</code>) — | |
| The tokenizer used to tokenize the prompt.`,name:"tokenizer"},{anchor:"diffusers.SanaVideoPipeline.text_encoder",description:`<strong>text_encoder</strong> (<code>Gemma2PreTrainedModel</code>) — | |
| Text encoder model to encode the input prompts.`,name:"text_encoder"},{anchor:"diffusers.SanaVideoPipeline.vae",description:`<strong>vae</strong> ([<code>AutoencoderKLWan</code> or <code>AutoencoderDCAEV</code>]) — | |
| Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.`,name:"vae"},{anchor:"diffusers.SanaVideoPipeline.transformer",description:`<strong>transformer</strong> (<a href="/docs/diffusers/pr_12249/en/api/models/sana_video_transformer3d#diffusers.SanaVideoTransformer3DModel">SanaVideoTransformer3DModel</a>) — | |
| Conditional Transformer to denoise the input latents.`,name:"transformer"},{anchor:"diffusers.SanaVideoPipeline.scheduler",description:`<strong>scheduler</strong> (<a href="/docs/diffusers/pr_12249/en/api/schedulers/multistep_dpm_solver#diffusers.DPMSolverMultistepScheduler">DPMSolverMultistepScheduler</a>) — | |
| A scheduler to be used in combination with <code>transformer</code> to denoise the encoded video latents.`,name:"scheduler"}],source:"https://github.com/huggingface/diffusers/blob/vr_12249/src/diffusers/pipelines/sana_video/pipeline_sana_video.py#L186"}}),te=new ce({props:{name:"__call__",anchor:"diffusers.SanaVideoPipeline.__call__",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]] = None"},{name:"negative_prompt",val:": str = ''"},{name:"num_inference_steps",val:": int = 50"},{name:"timesteps",val:": typing.List[int] = None"},{name:"sigmas",val:": typing.List[float] = None"},{name:"guidance_scale",val:": float = 6.0"},{name:"num_videos_per_prompt",val:": typing.Optional[int] = 1"},{name:"height",val:": int = 480"},{name:"width",val:": int = 832"},{name:"frames",val:": int = 81"},{name:"eta",val:": float = 0.0"},{name:"generator",val:": typing.Union[torch._C.Generator, typing.List[torch._C.Generator], NoneType] = None"},{name:"latents",val:": typing.Optional[torch.Tensor] = None"},{name:"prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"prompt_attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"output_type",val:": typing.Optional[str] = 'pil'"},{name:"return_dict",val:": bool = True"},{name:"clean_caption",val:": bool = False"},{name:"use_resolution_binning",val:": bool = True"},{name:"attention_kwargs",val:": typing.Optional[typing.Dict[str, typing.Any]] = None"},{name:"callback_on_step_end",val:": typing.Optional[typing.Callable[[int, int, typing.Dict], NoneType]] = None"},{name:"callback_on_step_end_tensor_inputs",val:": typing.List[str] = ['latents']"},{name:"max_sequence_length",val:": int = 300"},{name:"complex_human_instruction",val:`: typing.List[str] = ["Given a user prompt, generate an 'Enhanced prompt' that provides detailed visual 
descriptions suitable for video generation. Evaluate the level of detail in the user prompt:", '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, motion, and temporal relationships to create vivid and dynamic scenes.', '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.', 'Here are examples of how to transform or refine prompts:', '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat slowly settling into a curled position, peacefully falling asleep on a warm sunny windowsill, with gentle sunlight filtering through surrounding pots of blooming red flowers.', '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps gradually lighting up, a diverse crowd of people in colorful clothing walking past, and a double-decker bus smoothly passing by towering glass skyscrapers.', 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:', 'User Prompt: ']`}],parametersDescription:[{anchor:"diffusers.SanaVideoPipeline.__call__.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) — | |
| The prompt or prompts to guide the video generation. If not defined, one has to pass <code>prompt_embeds</code>. | |
| instead.`,name:"prompt"},{anchor:"diffusers.SanaVideoPipeline.__call__.negative_prompt",description:`<strong>negative_prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) — | |
| The prompt or prompts not to guide the video generation. If not defined, one has to pass | |
| <code>negative_prompt_embeds</code> instead. Ignored when not using guidance (i.e., ignored if <code>guidance_scale</code> is | |
| less than <code>1</code>).`,name:"negative_prompt"},{anchor:"diffusers.SanaVideoPipeline.__call__.num_inference_steps",description:`<strong>num_inference_steps</strong> (<code>int</code>, <em>optional</em>, defaults to 50) — | |
| The number of denoising steps. More denoising steps usually lead to a higher quality video at the | |
| expense of slower inference.`,name:"num_inference_steps"},{anchor:"diffusers.SanaVideoPipeline.__call__.timesteps",description:`<strong>timesteps</strong> (<code>List[int]</code>, <em>optional</em>) — | |
| Custom timesteps to use for the denoising process with schedulers which support a <code>timesteps</code> argument | |
| in their <code>set_timesteps</code> method. If not defined, the default behavior when <code>num_inference_steps</code> is | |
| passed will be used. Must be in descending order.`,name:"timesteps"},{anchor:"diffusers.SanaVideoPipeline.__call__.sigmas",description:`<strong>sigmas</strong> (<code>List[float]</code>, <em>optional</em>) — | |
| Custom sigmas to use for the denoising process with schedulers which support a <code>sigmas</code> argument in | |
| their <code>set_timesteps</code> method. If not defined, the default behavior when <code>num_inference_steps</code> is passed | |
| will be used.`,name:"sigmas"},{anchor:"diffusers.SanaVideoPipeline.__call__.guidance_scale",description:`<strong>guidance_scale</strong> (<code>float</code>, <em>optional</em>, defaults to 4.5) — | |
| Guidance scale as defined in <a href="https://huggingface.co/papers/2207.12598" rel="nofollow">Classifier-Free Diffusion | |
| Guidance</a>. <code>guidance_scale</code> is defined as <code>w</code> of equation 2. | |
| of <a href="https://huggingface.co/papers/2205.11487" rel="nofollow">Imagen Paper</a>. Guidance scale is enabled by setting | |
| <code>guidance_scale > 1</code>. Higher guidance scale encourages to generate videos that are closely linked to | |
| the text <code>prompt</code>, usually at the expense of lower video quality.`,name:"guidance_scale"},{anchor:"diffusers.SanaVideoPipeline.__call__.num_videos_per_prompt",description:`<strong>num_videos_per_prompt</strong> (<code>int</code>, <em>optional</em>, defaults to 1) — | |
| The number of videos to generate per prompt.`,name:"num_videos_per_prompt"},{anchor:"diffusers.SanaVideoPipeline.__call__.height",description:`<strong>height</strong> (<code>int</code>, <em>optional</em>, defaults to 480) — | |
| The height in pixels of the generated video.`,name:"height"},{anchor:"diffusers.SanaVideoPipeline.__call__.width",description:`<strong>width</strong> (<code>int</code>, <em>optional</em>, defaults to 832) — | |
| The width in pixels of the generated video.`,name:"width"},{anchor:"diffusers.SanaVideoPipeline.__call__.frames",description:`<strong>frames</strong> (<code>int</code>, <em>optional</em>, defaults to 81) — | |
| The number of frames in the generated video.`,name:"frames"},{anchor:"diffusers.SanaVideoPipeline.__call__.eta",description:`<strong>eta</strong> (<code>float</code>, <em>optional</em>, defaults to 0.0) — | |
| Corresponds to parameter eta (η) in the DDIM paper: <a href="https://huggingface.co/papers/2010.02502" rel="nofollow">https://huggingface.co/papers/2010.02502</a>. Only | |
| applies to <a href="/docs/diffusers/pr_12249/en/api/schedulers/ddim#diffusers.DDIMScheduler">schedulers.DDIMScheduler</a>, will be ignored for others.`,name:"eta"},{anchor:"diffusers.SanaVideoPipeline.__call__.generator",description:`<strong>generator</strong> (<code>torch.Generator</code> or <code>List[torch.Generator]</code>, <em>optional</em>) — | |
| One or a list of <a href="https://pytorch.org/docs/stable/generated/torch.Generator.html" rel="nofollow">torch generator(s)</a> | |
| to make generation deterministic.`,name:"generator"},{anchor:"diffusers.SanaVideoPipeline.__call__.latents",description:`<strong>latents</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video | |
| generation. Can be used to tweak the same generation with different prompts. If not provided, a latents | |
| tensor will be generated by sampling using the supplied random <code>generator</code>.`,name:"latents"},{anchor:"diffusers.SanaVideoPipeline.__call__.prompt_embeds",description:`<strong>prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Pre-generated text embeddings. Can be used to easily tweak text inputs, <em>e.g.</em> prompt weighting. If not | |
| provided, text embeddings will be generated from <code>prompt</code> input argument.`,name:"prompt_embeds"},{anchor:"diffusers.SanaVideoPipeline.__call__.prompt_attention_mask",description:"<strong>prompt_attention_mask</strong> (<code>torch.Tensor</code>, <em>optional</em>) — Pre-generated attention mask for text embeddings.",name:"prompt_attention_mask"},{anchor:"diffusers.SanaVideoPipeline.__call__.negative_prompt_embeds",description:`<strong>negative_prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not | |
| provided, negative_prompt_embeds will be generated from <code>negative_prompt</code> input argument.`,name:"negative_prompt_embeds"},{anchor:"diffusers.SanaVideoPipeline.__call__.negative_prompt_attention_mask",description:`<strong>negative_prompt_attention_mask</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Pre-generated attention mask for negative text embeddings.`,name:"negative_prompt_attention_mask"},{anchor:"diffusers.SanaVideoPipeline.__call__.output_type",description:`<strong>output_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"pil"</code>) — | |
| The output format of the generated video. Choose between mp4 or <code>np.array</code>.`,name:"output_type"},{anchor:"diffusers.SanaVideoPipeline.__call__.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) — | |
| Whether or not to return a <code>SanaVideoPipelineOutput</code> instead of a plain tuple.`,name:"return_dict"},{anchor:"diffusers.SanaVideoPipeline.__call__.attention_kwargs",description:`<strong>attention_kwargs</strong> — | |
| A kwargs dictionary that if specified is passed along to the <code>AttentionProcessor</code> as defined under | |
| <code>self.processor</code> in | |
| <a href="https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py" rel="nofollow">diffusers.models.attention_processor</a>.`,name:"attention_kwargs"},{anchor:"diffusers.SanaVideoPipeline.__call__.clean_caption",description:`<strong>clean_caption</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) — | |
| Whether or not to clean the caption before creating embeddings. Requires <code>beautifulsoup4</code> and <code>ftfy</code> to | |
| be installed. If the dependencies are not installed, the embeddings will be created from the raw | |
| prompt.`,name:"clean_caption"},{anchor:"diffusers.SanaVideoPipeline.__call__.use_resolution_binning",description:`<strong>use_resolution_binning</strong> (<code>bool</code> defaults to <code>True</code>) — | |
| If set to <code>True</code>, the requested height and width are first mapped to the closest resolutions using | |
| <code>ASPECT_RATIO_480_BIN</code> or <code>ASPECT_RATIO_720_BIN</code>. After the produced latents are decoded into videos, | |
| they are resized back to the requested resolution. Useful for generating non-square videos.`,name:"use_resolution_binning"},{anchor:"diffusers.SanaVideoPipeline.__call__.callback_on_step_end",description:`<strong>callback_on_step_end</strong> (<code>Callable</code>, <em>optional</em>) — | |
| A function that calls at the end of each denoising steps during the inference. The function is called | |
| with the following arguments: <code>callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)</code>. <code>callback_kwargs</code> will include a list of all tensors as specified by | |
| <code>callback_on_step_end_tensor_inputs</code>.`,name:"callback_on_step_end"},{anchor:"diffusers.SanaVideoPipeline.__call__.callback_on_step_end_tensor_inputs",description:`<strong>callback_on_step_end_tensor_inputs</strong> (<code>List</code>, <em>optional</em>) — | |
| The list of tensor inputs for the <code>callback_on_step_end</code> function. The tensors specified in the list | |
| will be passed as <code>callback_kwargs</code> argument. You will only be able to include variables listed in the | |
| <code>._callback_tensor_inputs</code> attribute of your pipeline class.`,name:"callback_on_step_end_tensor_inputs"},{anchor:"diffusers.SanaVideoPipeline.__call__.max_sequence_length",description:`<strong>max_sequence_length</strong> (<code>int</code> defaults to <code>300</code>) — | |
| Maximum sequence length to use with the <code>prompt</code>.`,name:"max_sequence_length"},{anchor:"diffusers.SanaVideoPipeline.__call__.complex_human_instruction",description:`<strong>complex_human_instruction</strong> (<code>List[str]</code>, <em>optional</em>) — | |
| Instructions for complex human attention: | |
| <a href="https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55" rel="nofollow">https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55</a>.`,name:"complex_human_instruction"}],source:"https://github.com/huggingface/diffusers/blob/vr_12249/src/diffusers/pipelines/sana_video/pipeline_sana_video.py#L701",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script> | |
| <p>If <code>return_dict</code> is <code>True</code>, <a | |
| href="/docs/diffusers/pr_12249/en/api/pipelines/sana_video#diffusers.pipelines.sana_video.pipeline_output.SanaVideoPipelineOutput" | |
| >SanaVideoPipelineOutput</a> is | |
| returned, otherwise a <code>tuple</code> is returned where the first element is a list with the generated videos</p> | |
| `,returnType:`<script context="module">export const metadata = 'undefined';<\/script> | |
| <p><a | |
| href="/docs/diffusers/pr_12249/en/api/pipelines/sana_video#diffusers.pipelines.sana_video.pipeline_output.SanaVideoPipelineOutput" | |
| >SanaVideoPipelineOutput</a> or <code>tuple</code></p> | |
| `}}),C=new Ut({props:{anchor:"diffusers.SanaVideoPipeline.__call__.example",$$slots:{default:[Yt]},$$scope:{ctx:U}}}),ne=new ce({props:{name:"encode_prompt",anchor:"diffusers.SanaVideoPipeline.encode_prompt",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]]"},{name:"do_classifier_free_guidance",val:": bool = True"},{name:"negative_prompt",val:": str = ''"},{name:"num_videos_per_prompt",val:": int = 1"},{name:"device",val:": typing.Optional[torch.device] = None"},{name:"prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"prompt_attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"clean_caption",val:": bool = False"},{name:"max_sequence_length",val:": int = 300"},{name:"complex_human_instruction",val:": typing.Optional[typing.List[str]] = None"},{name:"lora_scale",val:": typing.Optional[float] = None"}],parametersDescription:[{anchor:"diffusers.SanaVideoPipeline.encode_prompt.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) — | |
| prompt to be encoded`,name:"prompt"},{anchor:"diffusers.SanaVideoPipeline.encode_prompt.negative_prompt",description:`<strong>negative_prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) — | |
| The prompt not to guide the video generation. If not defined, one has to pass <code>negative_prompt_embeds</code> | |
| instead. Ignored when not using guidance (i.e., ignored if <code>guidance_scale</code> is less than <code>1</code>). For | |
| PixArt-Alpha, this should be "".`,name:"negative_prompt"},{anchor:"diffusers.SanaVideoPipeline.encode_prompt.do_classifier_free_guidance",description:`<strong>do_classifier_free_guidance</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) — | |
| whether to use classifier free guidance or not`,name:"do_classifier_free_guidance"},{anchor:"diffusers.SanaVideoPipeline.encode_prompt.num_videos_per_prompt",description:`<strong>num_videos_per_prompt</strong> (<code>int</code>, <em>optional</em>, defaults to 1) — | |
| number of videos that should be generated per prompt`,name:"num_videos_per_prompt"},{anchor:"diffusers.SanaVideoPipeline.encode_prompt.device",description:`<strong>device</strong> — (<code>torch.device</code>, <em>optional</em>): | |
| torch device to place the resulting embeddings on`,name:"device"},{anchor:"diffusers.SanaVideoPipeline.encode_prompt.prompt_embeds",description:`<strong>prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Pre-generated text embeddings. Can be used to easily tweak text inputs, <em>e.g.</em> prompt weighting. If not | |
| provided, text embeddings will be generated from <code>prompt</code> input argument.`,name:"prompt_embeds"},{anchor:"diffusers.SanaVideoPipeline.encode_prompt.negative_prompt_embeds",description:`<strong>negative_prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Pre-generated negative text embeddings. For Sana, it’s should be the embeddings of the "" string.`,name:"negative_prompt_embeds"},{anchor:"diffusers.SanaVideoPipeline.encode_prompt.clean_caption",description:`<strong>clean_caption</strong> (<code>bool</code>, defaults to <code>False</code>) — | |
| If <code>True</code>, the function will preprocess and clean the provided caption before encoding.`,name:"clean_caption"},{anchor:"diffusers.SanaVideoPipeline.encode_prompt.max_sequence_length",description:"<strong>max_sequence_length</strong> (<code>int</code>, defaults to 300) — Maximum sequence length to use for the prompt.",name:"max_sequence_length"},{anchor:"diffusers.SanaVideoPipeline.encode_prompt.complex_human_instruction",description:`<strong>complex_human_instruction</strong> (<code>list[str]</code>, defaults to <code>complex_human_instruction</code>) — | |
| If <code>complex_human_instruction</code> is not empty, the function will use the complex Human instruction for | |
| the prompt.`,name:"complex_human_instruction"}],source:"https://github.com/huggingface/diffusers/blob/vr_12249/src/diffusers/pipelines/sana_video/pipeline_sana_video.py#L292"}}),oe=new be({props:{title:"SanaImageToVideoPipeline",local:"diffusers.SanaImageToVideoPipeline",headingTag:"h2"}}),ae=new ce({props:{name:"class diffusers.SanaImageToVideoPipeline",anchor:"diffusers.SanaImageToVideoPipeline",parameters:[{name:"tokenizer",val:": typing.Union[transformers.models.gemma.tokenization_gemma.GemmaTokenizer, transformers.models.gemma.tokenization_gemma_fast.GemmaTokenizerFast]"},{name:"text_encoder",val:": Gemma2PreTrainedModel"},{name:"vae",val:": typing.Union[diffusers.models.autoencoders.autoencoder_dc.AutoencoderDC, diffusers.models.autoencoders.autoencoder_kl_wan.AutoencoderKLWan]"},{name:"transformer",val:": SanaVideoTransformer3DModel"},{name:"scheduler",val:": FlowMatchEulerDiscreteScheduler"}],parametersDescription:[{anchor:"diffusers.SanaImageToVideoPipeline.tokenizer",description:`<strong>tokenizer</strong> (<code>GemmaTokenizer</code> or <code>GemmaTokenizerFast</code>) — | |
| The tokenizer used to tokenize the prompt.`,name:"tokenizer"},{anchor:"diffusers.SanaImageToVideoPipeline.text_encoder",description:`<strong>text_encoder</strong> (<code>Gemma2PreTrainedModel</code>) — | |
| Text encoder model to encode the input prompts.`,name:"text_encoder"},{anchor:"diffusers.SanaImageToVideoPipeline.vae",description:`<strong>vae</strong> ([<code>AutoencoderKLWan</code> or <code>AutoencoderDCAEV</code>]) — | |
| Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.`,name:"vae"},{anchor:"diffusers.SanaImageToVideoPipeline.transformer",description:`<strong>transformer</strong> (<a href="/docs/diffusers/pr_12249/en/api/models/sana_video_transformer3d#diffusers.SanaVideoTransformer3DModel">SanaVideoTransformer3DModel</a>) — | |
| Conditional Transformer to denoise the input latents.`,name:"transformer"},{anchor:"diffusers.SanaImageToVideoPipeline.scheduler",description:`<strong>scheduler</strong> (<a href="/docs/diffusers/pr_12249/en/api/schedulers/flow_match_euler_discrete#diffusers.FlowMatchEulerDiscreteScheduler">FlowMatchEulerDiscreteScheduler</a>) — | |
| A scheduler to be used in combination with <code>transformer</code> to denoise the encoded video latents.`,name:"scheduler"}],source:"https://github.com/huggingface/diffusers/blob/vr_12249/src/diffusers/pipelines/sana_video/pipeline_sana_video_i2v.py#L176"}}),se=new ce({props:{name:"__call__",anchor:"diffusers.SanaImageToVideoPipeline.__call__",parameters:[{name:"image",val:": typing.Union[PIL.Image.Image, numpy.ndarray, torch.Tensor, typing.List[PIL.Image.Image], typing.List[numpy.ndarray], typing.List[torch.Tensor]]"},{name:"prompt",val:": typing.Union[str, typing.List[str]] = None"},{name:"negative_prompt",val:": str = ''"},{name:"num_inference_steps",val:": int = 50"},{name:"timesteps",val:": typing.List[int] = None"},{name:"sigmas",val:": typing.List[float] = None"},{name:"guidance_scale",val:": float = 6.0"},{name:"num_videos_per_prompt",val:": typing.Optional[int] = 1"},{name:"height",val:": int = 480"},{name:"width",val:": int = 832"},{name:"frames",val:": int = 81"},{name:"eta",val:": float = 0.0"},{name:"generator",val:": typing.Union[torch._C.Generator, typing.List[torch._C.Generator], NoneType] = None"},{name:"latents",val:": typing.Optional[torch.Tensor] = None"},{name:"prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"prompt_attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"output_type",val:": typing.Optional[str] = 'pil'"},{name:"return_dict",val:": bool = True"},{name:"clean_caption",val:": bool = False"},{name:"use_resolution_binning",val:": bool = True"},{name:"attention_kwargs",val:": typing.Optional[typing.Dict[str, typing.Any]] = None"},{name:"callback_on_step_end",val:": typing.Optional[typing.Callable[[int, int, typing.Dict], NoneType]] = None"},{name:"callback_on_step_end_tensor_inputs",val:": typing.List[str] = 
['latents']"},{name:"max_sequence_length",val:": int = 300"},{name:"complex_human_instruction",val:`: typing.List[str] = ["Given a user prompt, generate an 'Enhanced prompt' that provides detailed visual descriptions suitable for video generation. Evaluate the level of detail in the user prompt:", '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, motion, and temporal relationships to create vivid and dynamic scenes.', '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.', 'Here are examples of how to transform or refine prompts:', '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat slowly settling into a curled position, peacefully falling asleep on a warm sunny windowsill, with gentle sunlight filtering through surrounding pots of blooming red flowers.', '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps gradually lighting up, a diverse crowd of people in colorful clothing walking past, and a double-decker bus smoothly passing by towering glass skyscrapers.', 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:', 'User Prompt: ']`}],parametersDescription:[{anchor:"diffusers.SanaImageToVideoPipeline.__call__.image",description:`<strong>image</strong> (<code>PipelineImageInput</code>) — | |
| The input image to condition the video generation on. The first frame of the generated video will be | |
| conditioned on this image.`,name:"image"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) — | |
| The prompt or prompts to guide the video generation. If not defined, one has to pass <code>prompt_embeds</code>. | |
| instead.`,name:"prompt"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.negative_prompt",description:`<strong>negative_prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) — | |
| The prompt or prompts not to guide the video generation. If not defined, one has to pass | |
| <code>negative_prompt_embeds</code> instead. Ignored when not using guidance (i.e., ignored if <code>guidance_scale</code> is | |
| less than <code>1</code>).`,name:"negative_prompt"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.num_inference_steps",description:`<strong>num_inference_steps</strong> (<code>int</code>, <em>optional</em>, defaults to 50) — | |
| The number of denoising steps. More denoising steps usually lead to a higher quality video at the | |
| expense of slower inference.`,name:"num_inference_steps"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.timesteps",description:`<strong>timesteps</strong> (<code>List[int]</code>, <em>optional</em>) — | |
| Custom timesteps to use for the denoising process with schedulers which support a <code>timesteps</code> argument | |
| in their <code>set_timesteps</code> method. If not defined, the default behavior when <code>num_inference_steps</code> is | |
| passed will be used. Must be in descending order.`,name:"timesteps"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.sigmas",description:`<strong>sigmas</strong> (<code>List[float]</code>, <em>optional</em>) — | |
| Custom sigmas to use for the denoising process with schedulers which support a <code>sigmas</code> argument in | |
| their <code>set_timesteps</code> method. If not defined, the default behavior when <code>num_inference_steps</code> is passed | |
| will be used.`,name:"sigmas"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.guidance_scale",description:`<strong>guidance_scale</strong> (<code>float</code>, <em>optional</em>, defaults to 4.5) — | |
| Guidance scale as defined in <a href="https://huggingface.co/papers/2207.12598" rel="nofollow">Classifier-Free Diffusion | |
| Guidance</a>. <code>guidance_scale</code> is defined as <code>w</code> of equation 2. | |
| of <a href="https://huggingface.co/papers/2205.11487" rel="nofollow">Imagen Paper</a>. Guidance scale is enabled by setting | |
| <code>guidance_scale > 1</code>. Higher guidance scale encourages to generate videos that are closely linked to | |
| the text <code>prompt</code>, usually at the expense of lower video quality.`,name:"guidance_scale"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.num_videos_per_prompt",description:`<strong>num_videos_per_prompt</strong> (<code>int</code>, <em>optional</em>, defaults to 1) — | |
| The number of videos to generate per prompt.`,name:"num_videos_per_prompt"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.height",description:`<strong>height</strong> (<code>int</code>, <em>optional</em>, defaults to 480) — | |
| The height in pixels of the generated video.`,name:"height"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.width",description:`<strong>width</strong> (<code>int</code>, <em>optional</em>, defaults to 832) — | |
| The width in pixels of the generated video.`,name:"width"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.frames",description:`<strong>frames</strong> (<code>int</code>, <em>optional</em>, defaults to 81) — | |
| The number of frames in the generated video.`,name:"frames"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.eta",description:`<strong>eta</strong> (<code>float</code>, <em>optional</em>, defaults to 0.0) — | |
| Corresponds to parameter eta (η) in the DDIM paper: <a href="https://huggingface.co/papers/2010.02502" rel="nofollow">https://huggingface.co/papers/2010.02502</a>. Only | |
| applies to <a href="/docs/diffusers/pr_12249/en/api/schedulers/ddim#diffusers.DDIMScheduler">schedulers.DDIMScheduler</a>, will be ignored for others.`,name:"eta"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.generator",description:`<strong>generator</strong> (<code>torch.Generator</code> or <code>List[torch.Generator]</code>, <em>optional</em>) — | |
| One or a list of <a href="https://pytorch.org/docs/stable/generated/torch.Generator.html" rel="nofollow">torch generator(s)</a> | |
| to make generation deterministic.`,name:"generator"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.latents",description:`<strong>latents</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video | |
| generation. Can be used to tweak the same generation with different prompts. If not provided, a latents | |
| tensor will be generated by sampling using the supplied random <code>generator</code>.`,name:"latents"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.prompt_embeds",description:`<strong>prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Pre-generated text embeddings. Can be used to easily tweak text inputs, <em>e.g.</em> prompt weighting. If not | |
| provided, text embeddings will be generated from <code>prompt</code> input argument.`,name:"prompt_embeds"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.prompt_attention_mask",description:"<strong>prompt_attention_mask</strong> (<code>torch.Tensor</code>, <em>optional</em>) — Pre-generated attention mask for text embeddings.",name:"prompt_attention_mask"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.negative_prompt_embeds",description:`<strong>negative_prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not | |
| provided, negative_prompt_embeds will be generated from <code>negative_prompt</code> input argument.`,name:"negative_prompt_embeds"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.negative_prompt_attention_mask",description:`<strong>negative_prompt_attention_mask</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Pre-generated attention mask for negative text embeddings.`,name:"negative_prompt_attention_mask"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.output_type",description:`<strong>output_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"pil"</code>) — | |
| The output format of the generated video. Choose between mp4 or <code>np.array</code>.`,name:"output_type"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) — | |
| Whether or not to return a <code>SanaVideoPipelineOutput</code> instead of a plain tuple.`,name:"return_dict"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.attention_kwargs",description:`<strong>attention_kwargs</strong> — | |
| A kwargs dictionary that if specified is passed along to the <code>AttentionProcessor</code> as defined under | |
| <code>self.processor</code> in | |
| <a href="https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py" rel="nofollow">diffusers.models.attention_processor</a>.`,name:"attention_kwargs"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.clean_caption",description:`<strong>clean_caption</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) — | |
| Whether or not to clean the caption before creating embeddings. Requires <code>beautifulsoup4</code> and <code>ftfy</code> to | |
| be installed. If the dependencies are not installed, the embeddings will be created from the raw | |
| prompt.`,name:"clean_caption"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.use_resolution_binning",description:`<strong>use_resolution_binning</strong> (<code>bool</code> defaults to <code>True</code>) — | |
| If set to <code>True</code>, the requested height and width are first mapped to the closest resolutions using | |
| <code>ASPECT_RATIO_480_BIN</code> or <code>ASPECT_RATIO_720_BIN</code>. After the produced latents are decoded into videos, | |
| they are resized back to the requested resolution. Useful for generating non-square videos.`,name:"use_resolution_binning"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.callback_on_step_end",description:`<strong>callback_on_step_end</strong> (<code>Callable</code>, <em>optional</em>) — | |
| A function that calls at the end of each denoising steps during the inference. The function is called | |
| with the following arguments: <code>callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)</code>. <code>callback_kwargs</code> will include a list of all tensors as specified by | |
| <code>callback_on_step_end_tensor_inputs</code>.`,name:"callback_on_step_end"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.callback_on_step_end_tensor_inputs",description:`<strong>callback_on_step_end_tensor_inputs</strong> (<code>List</code>, <em>optional</em>) — | |
| The list of tensor inputs for the <code>callback_on_step_end</code> function. The tensors specified in the list | |
| will be passed as <code>callback_kwargs</code> argument. You will only be able to include variables listed in the | |
| <code>._callback_tensor_inputs</code> attribute of your pipeline class.`,name:"callback_on_step_end_tensor_inputs"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.max_sequence_length",description:`<strong>max_sequence_length</strong> (<code>int</code> defaults to <code>300</code>) — | |
| Maximum sequence length to use with the <code>prompt</code>.`,name:"max_sequence_length"},{anchor:"diffusers.SanaImageToVideoPipeline.__call__.complex_human_instruction",description:`<strong>complex_human_instruction</strong> (<code>List[str]</code>, <em>optional</em>) — | |
| Instructions for complex human attention: | |
| <a href="https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55" rel="nofollow">https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55</a>.`,name:"complex_human_instruction"}],source:"https://github.com/huggingface/diffusers/blob/vr_12249/src/diffusers/pipelines/sana_video/pipeline_sana_video_i2v.py#L724",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script> | |
| <p>If <code>return_dict</code> is <code>True</code>, <a | |
| href="/docs/diffusers/pr_12249/en/api/pipelines/sana_video#diffusers.pipelines.sana_video.pipeline_output.SanaVideoPipelineOutput" | |
| >SanaVideoPipelineOutput</a> is | |
| returned, otherwise a <code>tuple</code> is returned where the first element is a list with the generated videos</p> | |
| `,returnType:`<script context="module">export const metadata = 'undefined';<\/script> | |
| <p><a | |
| href="/docs/diffusers/pr_12249/en/api/pipelines/sana_video#diffusers.pipelines.sana_video.pipeline_output.SanaVideoPipelineOutput" | |
| >SanaVideoPipelineOutput</a> or <code>tuple</code></p> | |
| `}}),S=new Ut({props:{anchor:"diffusers.SanaImageToVideoPipeline.__call__.example",$$slots:{default:[Qt]},$$scope:{ctx:U}}}),ie=new ce({props:{name:"encode_prompt",anchor:"diffusers.SanaImageToVideoPipeline.encode_prompt",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]]"},{name:"do_classifier_free_guidance",val:": bool = True"},{name:"negative_prompt",val:": str = ''"},{name:"num_videos_per_prompt",val:": int = 1"},{name:"device",val:": typing.Optional[torch.device] = None"},{name:"prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"prompt_attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"clean_caption",val:": bool = False"},{name:"max_sequence_length",val:": int = 300"},{name:"complex_human_instruction",val:": typing.Optional[typing.List[str]] = None"},{name:"lora_scale",val:": typing.Optional[float] = None"}],parametersDescription:[{anchor:"diffusers.SanaImageToVideoPipeline.encode_prompt.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) — | |
| prompt to be encoded`,name:"prompt"},{anchor:"diffusers.SanaImageToVideoPipeline.encode_prompt.negative_prompt",description:`<strong>negative_prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) — | |
| The prompt not to guide the video generation. If not defined, one has to pass <code>negative_prompt_embeds</code> | |
| instead. Ignored when not using guidance (i.e., ignored if <code>guidance_scale</code> is less than <code>1</code>). For | |
| PixArt-Alpha, this should be "".`,name:"negative_prompt"},{anchor:"diffusers.SanaImageToVideoPipeline.encode_prompt.do_classifier_free_guidance",description:`<strong>do_classifier_free_guidance</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) — | |
| whether to use classifier free guidance or not`,name:"do_classifier_free_guidance"},{anchor:"diffusers.SanaImageToVideoPipeline.encode_prompt.num_videos_per_prompt",description:`<strong>num_videos_per_prompt</strong> (<code>int</code>, <em>optional</em>, defaults to 1) — | |
| number of videos that should be generated per prompt`,name:"num_videos_per_prompt"},{anchor:"diffusers.SanaImageToVideoPipeline.encode_prompt.device",description:`<strong>device</strong> — (<code>torch.device</code>, <em>optional</em>): | |
| torch device to place the resulting embeddings on`,name:"device"},{anchor:"diffusers.SanaImageToVideoPipeline.encode_prompt.prompt_embeds",description:`<strong>prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Pre-generated text embeddings. Can be used to easily tweak text inputs, <em>e.g.</em> prompt weighting. If not | |
| provided, text embeddings will be generated from <code>prompt</code> input argument.`,name:"prompt_embeds"},{anchor:"diffusers.SanaImageToVideoPipeline.encode_prompt.negative_prompt_embeds",description:`<strong>negative_prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) — | |
| Pre-generated negative text embeddings. For Sana, it’s should be the embeddings of the "" string.`,name:"negative_prompt_embeds"},{anchor:"diffusers.SanaImageToVideoPipeline.encode_prompt.clean_caption",description:`<strong>clean_caption</strong> (<code>bool</code>, defaults to <code>False</code>) — | |
| If <code>True</code>, the function will preprocess and clean the provided caption before encoding.`,name:"clean_caption"},{anchor:"diffusers.SanaImageToVideoPipeline.encode_prompt.max_sequence_length",description:"<strong>max_sequence_length</strong> (<code>int</code>, defaults to 300) — Maximum sequence length to use for the prompt.",name:"max_sequence_length"},{anchor:"diffusers.SanaImageToVideoPipeline.encode_prompt.complex_human_instruction",description:`<strong>complex_human_instruction</strong> (<code>list[str]</code>, defaults to <code>complex_human_instruction</code>) — | |
| If <code>complex_human_instruction</code> is not empty, the function will use the complex Human instruction for | |
| the prompt.`,name:"complex_human_instruction"}],source:"https://github.com/huggingface/diffusers/blob/vr_12249/src/diffusers/pipelines/sana_video/pipeline_sana_video_i2v.py#L290"}}),le=new be({props:{title:"SanaVideoPipelineOutput",local:"diffusers.pipelines.sana_video.pipeline_output.SanaVideoPipelineOutput",headingTag:"h2"}}),re=new ce({props:{name:"class diffusers.pipelines.sana_video.pipeline_output.SanaVideoPipelineOutput",anchor:"diffusers.pipelines.sana_video.pipeline_output.SanaVideoPipelineOutput",parameters:[{name:"frames",val:": Tensor"}],parametersDescription:[{anchor:"diffusers.pipelines.sana_video.pipeline_output.SanaVideoPipelineOutput.frames",description:`<strong>frames</strong> (<code>torch.Tensor</code>, <code>np.ndarray</code>, or List[List[PIL.Image.Image]]) — | |
| List of video outputs - It can be a nested list of length <code>batch_size,</code> with each sub-list containing | |
| denoised PIL image sequences of length <code>num_frames.</code> It can also be a NumPy array or Torch tensor of shape | |
| <code>(batch_size, num_frames, channels, height, width)</code>.`,name:"frames"}],source:"https://github.com/huggingface/diffusers/blob/vr_12249/src/diffusers/pipelines/sana_video/pipeline_output.py#L9"}}),de=new Ct({props:{source:"https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/sana_video.md"}}),{c(){s=c("meta"),h=l(),u=c("p"),i=l(),f(p.$$.fragment),t=l(),d=c("div"),d.innerHTML=we,Y=l(),Q=c("p"),Q.innerHTML=rt,ve=l(),z=c("p"),z.textContent=dt,Ue=l(),R=c("p"),R.innerHTML=pt,je=l(),E=c("p"),E.innerHTML=ct,Ze=l(),F=c("p"),F.textContent=mt,Ge=l(),$=c("table"),$.innerHTML=ut,Ie=l(),H=c("p"),H.innerHTML=ht,Be=l(),P=c("p"),P.innerHTML=gt,Ve=l(),f(A.$$.fragment),We=l(),f(x.$$.fragment),xe=l(),f(q.$$.fragment),Ce=l(),L=c("p"),L.textContent=ft,ke=l(),D=c("p"),D.innerHTML=Mt,Se=l(),f(O.$$.fragment),Ne=l(),f(K.$$.fragment),Xe=l(),J=c("div"),f(ee.$$.fragment),qe=l(),me=c("p"),me.innerHTML=_t,Le=l(),Z=c("div"),f(te.$$.fragment),De=l(),ue=c("p"),ue.textContent=yt,Oe=l(),f(C.$$.fragment),Ke=l(),k=c("div"),f(ne.$$.fragment),et=l(),he=c("p"),he.textContent=bt,Ye=l(),f(oe.$$.fragment),Qe=l(),v=c("div"),f(ae.$$.fragment),tt=l(),ge=c("p"),ge.innerHTML=wt,nt=l(),G=c("div"),f(se.$$.fragment),ot=l(),fe=c("p"),fe.textContent=Tt,at=l(),f(S.$$.fragment),st=l(),N=c("div"),f(ie.$$.fragment),it=l(),Me=c("p"),Me.textContent=Jt,ze=l(),f(le.$$.fragment),Re=l(),B=c("div"),f(re.$$.fragment),lt=l(),_e=c("p"),_e.textContent=vt,Ee=l(),f(de.$$.fragment),Fe=l(),Te=c("p"),this.h()},l(e){const 
o=Vt("svelte-u9bgzb",document.head);s=m(o,"META",{name:!0,content:!0}),o.forEach(n),h=r(e),u=m(e,"P",{}),W(u).forEach(n),i=r(e),M(p.$$.fragment,e),t=r(e),d=m(e,"DIV",{class:!0,"data-svelte-h":!0}),T(d)!=="svelte-1elo7hh"&&(d.innerHTML=we),Y=r(e),Q=m(e,"P",{"data-svelte-h":!0}),T(Q)!=="svelte-1v2o059"&&(Q.innerHTML=rt),ve=r(e),z=m(e,"P",{"data-svelte-h":!0}),T(z)!=="svelte-1cwsb16"&&(z.textContent=dt),Ue=r(e),R=m(e,"P",{"data-svelte-h":!0}),T(R)!=="svelte-197sf2y"&&(R.innerHTML=pt),je=r(e),E=m(e,"P",{"data-svelte-h":!0}),T(E)!=="svelte-tnv2q5"&&(E.innerHTML=ct),Ze=r(e),F=m(e,"P",{"data-svelte-h":!0}),T(F)!=="svelte-1bob28v"&&(F.textContent=mt),Ge=r(e),$=m(e,"TABLE",{"data-svelte-h":!0}),T($)!=="svelte-1y7nu0d"&&($.innerHTML=ut),Ie=r(e),H=m(e,"P",{"data-svelte-h":!0}),T(H)!=="svelte-126jk2h"&&(H.innerHTML=ht),Be=r(e),P=m(e,"P",{"data-svelte-h":!0}),T(P)!=="svelte-okwwje"&&(P.innerHTML=gt),Ve=r(e),M(A.$$.fragment,e),We=r(e),M(x.$$.fragment,e),xe=r(e),M(q.$$.fragment,e),Ce=r(e),L=m(e,"P",{"data-svelte-h":!0}),T(L)!=="svelte-1ou2pxc"&&(L.textContent=ft),ke=r(e),D=m(e,"P",{"data-svelte-h":!0}),T(D)!=="svelte-1jvzwl5"&&(D.innerHTML=Mt),Se=r(e),M(O.$$.fragment,e),Ne=r(e),M(K.$$.fragment,e),Xe=r(e),J=m(e,"DIV",{class:!0});var j=W(J);M(ee.$$.fragment,j),qe=r(j),me=m(j,"P",{"data-svelte-h":!0}),T(me)!=="svelte-sdkp0c"&&(me.innerHTML=_t),Le=r(j),Z=m(j,"DIV",{class:!0});var V=W(Z);M(te.$$.fragment,V),De=r(V),ue=m(V,"P",{"data-svelte-h":!0}),T(ue)!=="svelte-v78lg8"&&(ue.textContent=yt),Oe=r(V),M(C.$$.fragment,V),V.forEach(n),Ke=r(j),k=m(j,"DIV",{class:!0});var pe=W(k);M(ne.$$.fragment,pe),et=r(pe),he=m(pe,"P",{"data-svelte-h":!0}),T(he)!=="svelte-16q0ax1"&&(he.textContent=bt),pe.forEach(n),j.forEach(n),Ye=r(e),M(oe.$$.fragment,e),Qe=r(e),v=m(e,"DIV",{class:!0});var X=W(v);M(ae.$$.fragment,X),tt=r(X),ge=m(X,"P",{"data-svelte-h":!0}),T(ge)!=="svelte-1jiydhq"&&(ge.innerHTML=wt),nt=r(X),G=m(X,"DIV",{class:!0});var 
ye=W(G);M(se.$$.fragment,ye),ot=r(ye),fe=m(ye,"P",{"data-svelte-h":!0}),T(fe)!=="svelte-v78lg8"&&(fe.textContent=Tt),at=r(ye),M(S.$$.fragment,ye),ye.forEach(n),st=r(X),N=m(X,"DIV",{class:!0});var He=W(N);M(ie.$$.fragment,He),it=r(He),Me=m(He,"P",{"data-svelte-h":!0}),T(Me)!=="svelte-16q0ax1"&&(Me.textContent=Jt),He.forEach(n),X.forEach(n),ze=r(e),M(le.$$.fragment,e),Re=r(e),B=m(e,"DIV",{class:!0});var Pe=W(B);M(re.$$.fragment,Pe),lt=r(Pe),_e=m(Pe,"P",{"data-svelte-h":!0}),T(_e)!=="svelte-118o1xs"&&(_e.textContent=vt),Pe.forEach(n),Ee=r(e),M(de.$$.fragment,e),Fe=r(e),Te=m(e,"P",{}),W(Te).forEach(n),this.h()},h(){I(s,"name","hf:doc:metadata"),I(s,"content",Rt),I(d,"class","flex flex-wrap space-x-1"),I(Z,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),I(k,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),I(J,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),I(G,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),I(N,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),I(v,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),I(B,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 
mt-8")},m(e,o){g(document.head,s),a(e,h,o),a(e,u,o),a(e,i,o),_(p,e,o),a(e,t,o),a(e,d,o),a(e,Y,o),a(e,Q,o),a(e,ve,o),a(e,z,o),a(e,Ue,o),a(e,R,o),a(e,je,o),a(e,E,o),a(e,Ze,o),a(e,F,o),a(e,Ge,o),a(e,$,o),a(e,Ie,o),a(e,H,o),a(e,Be,o),a(e,P,o),a(e,Ve,o),_(A,e,o),a(e,We,o),_(x,e,o),a(e,xe,o),_(q,e,o),a(e,Ce,o),a(e,L,o),a(e,ke,o),a(e,D,o),a(e,Se,o),_(O,e,o),a(e,Ne,o),_(K,e,o),a(e,Xe,o),a(e,J,o),_(ee,J,null),g(J,qe),g(J,me),g(J,Le),g(J,Z),_(te,Z,null),g(Z,De),g(Z,ue),g(Z,Oe),_(C,Z,null),g(J,Ke),g(J,k),_(ne,k,null),g(k,et),g(k,he),a(e,Ye,o),_(oe,e,o),a(e,Qe,o),a(e,v,o),_(ae,v,null),g(v,tt),g(v,ge),g(v,nt),g(v,G),_(se,G,null),g(G,ot),g(G,fe),g(G,at),_(S,G,null),g(v,st),g(v,N),_(ie,N,null),g(N,it),g(N,Me),a(e,ze,o),_(le,e,o),a(e,Re,o),a(e,B,o),_(re,B,null),g(B,lt),g(B,_e),a(e,Ee,o),_(de,e,o),a(e,Fe,o),a(e,Te,o),$e=!0},p(e,[o]){const j={};o&2&&(j.$$scope={dirty:o,ctx:e}),x.$set(j);const V={};o&2&&(V.$$scope={dirty:o,ctx:e}),C.$set(V);const pe={};o&2&&(pe.$$scope={dirty:o,ctx:e}),S.$set(pe)},i(e){$e||(y(p.$$.fragment,e),y(A.$$.fragment,e),y(x.$$.fragment,e),y(q.$$.fragment,e),y(O.$$.fragment,e),y(K.$$.fragment,e),y(ee.$$.fragment,e),y(te.$$.fragment,e),y(C.$$.fragment,e),y(ne.$$.fragment,e),y(oe.$$.fragment,e),y(ae.$$.fragment,e),y(se.$$.fragment,e),y(S.$$.fragment,e),y(ie.$$.fragment,e),y(le.$$.fragment,e),y(re.$$.fragment,e),y(de.$$.fragment,e),$e=!0)},o(e){b(p.$$.fragment,e),b(A.$$.fragment,e),b(x.$$.fragment,e),b(q.$$.fragment,e),b(O.$$.fragment,e),b(K.$$.fragment,e),b(ee.$$.fragment,e),b(te.$$.fragment,e),b(C.$$.fragment,e),b(ne.$$.fragment,e),b(oe.$$.fragment,e),b(ae.$$.fragment,e),b(se.$$.fragment,e),b(S.$$.fragment,e),b(ie.$$.fragment,e),b(le.$$.fragment,e),b(re.$$.fragment,e),b(de.$$.fragment,e),$e=!1},d(e){e&&(n(h),n(u),n(i),n(t),n(d),n(Y),n(Q),n(ve),n(z),n(Ue),n(R),n(je),n(E),n(Ze),n(F),n(Ge),n($),n(Ie),n(H),n(Be),n(P),n(Ve),n(We),n(xe),n(Ce),n(L),n(ke),n(D),n(Se),n(Ne),n(Xe),n(J),n(Ye),n(Qe),n(v),n(ze),n(Re),n(B),n(Ee),n(Fe),n(Te)),n(s),w(p,e),w(A,e),w(x,e),w(q,e),w(
O,e),w(K,e),w(ee),w(te),w(C),w(ne),w(oe,e),w(ae),w(se),w(S),w(ie),w(le,e),w(re),w(de,e)}}}/**
 * Page outline serialized as a JSON string: the page title ("Sana-Video"),
 * its anchor (`local`), and five depth-2 sections, each with its own
 * title/anchor and an empty `sections` list.
 * The h() hook earlier in this file passes it to I(s,"content",Rt) for the
 * <meta> element whose "name" is set to "hf:doc:metadata".
 */const Rt='{"title":"Sana-Video","local":"sana-video","sections":[{"title":"Generation Pipelines","local":"generation-pipelines","sections":[],"depth":2},{"title":"Quantization","local":"quantization","sections":[],"depth":2},{"title":"SanaVideoPipeline","local":"diffusers.SanaVideoPipeline","sections":[],"depth":2},{"title":"SanaImageToVideoPipeline","local":"diffusers.SanaImageToVideoPipeline","sections":[],"depth":2},{"title":"SanaVideoPipelineOutput","local":"diffusers.pipelines.sana_video.pipeline_output.SanaVideoPipelineOutput","sections":[],"depth":2}],"depth":1}';/**
 * Instance function handed to Bt by the Dt constructor.
 * Registers a callback through Gt (imported as `o` from the scheduler chunk;
 * presumably Svelte's onMount - TODO confirm). The callback parses
 * window.location.search and looks up the "fw" query parameter, but the
 * looked-up value is discarded.
 * The comma expression makes the function return the empty array, not the
 * result of Gt. The parameter U is not read.
 */function Et(U){return Gt(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}/**
 * Page component, exported under the name `component`.
 * The constructor calls super() and then Bt(this, s, Et, zt, Zt, {}), where
 * `s` is the constructor argument, Et is the function above, zt is defined
 * earlier in the file (outside this excerpt), and Zt is imported as `s` from
 * the scheduler chunk.
 * NOTE(review): It/Bt are imported as `S`/`i` from the index chunk; they look
 * like Svelte's SvelteComponent base class and init helper - confirm.
 */class Dt extends It{constructor(s){super(),Bt(this,s,Et,zt,Zt,{})}}export{Dt as component}; | |
Xet Storage Details
- Size: 88.4 kB
- Xet hash: 1a6641e8b1e91aeda5de7a02392b59bd23264730c7a40191aafc27df5848ac0f
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.