Buckets:

rtrm's picture
download
raw
36.4 kB
import{s as lt,o as pt,n as rt}from"../chunks/scheduler.8c3d61f6.js";import{S as dt,i as ct,g as r,s as a,r as m,A as mt,h as l,f as n,c as s,j as re,u as f,x as d,k as le,y as T,a as o,v as u,d as h,t as g,w as _}from"../chunks/index.da70eac4.js";import{T as ft}from"../chunks/Tip.1d9b8c37.js";import{D as Qe}from"../chunks/Docstring.c021b19a.js";import{C as se}from"../chunks/CodeBlock.a9c4becf.js";import{E as ut}from"../chunks/ExampleCodeBlock.56b4589c.js";import{H as Ge,E as ht}from"../chunks/getInferenceSnippets.725ed3d4.js";function gt(ee){let i,J='Make sure to check out the Schedulers <a href="../../using-diffusers/schedulers">guide</a> to learn how to explore the tradeoff between scheduler speed and quality, and see the <a href="../../using-diffusers/loading#reuse-a-pipeline">reuse components across pipelines</a> section to learn how to efficiently load the same components into multiple pipelines.';return{c(){i=r("p"),i.innerHTML=J},l(c){i=l(c,"P",{"data-svelte-h":!0}),d(i)!=="svelte-1qn15hi"&&(i.innerHTML=J)},m(c,v){o(c,i,v)},p:rt,d(c){c&&n(i)}}}function _t(ee){let i,J="Examples:",c,v,w;return v=new se({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwTGF0dGVQaXBlbGluZSUwQWZyb20lMjBkaWZmdXNlcnMudXRpbHMlMjBpbXBvcnQlMjBleHBvcnRfdG9fZ2lmJTBBJTBBJTIzJTIwWW91JTIwY2FuJTIwcmVwbGFjZSUyMHRoZSUyMGNoZWNrcG9pbnQlMjBpZCUyMHdpdGglMjAlMjJtYXhpbi1jbiUyRkxhdHRlLTElMjIlMjB0b28uJTBBcGlwZSUyMCUzRCUyMExhdHRlUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUyMm1heGluLWNuJTJGTGF0dGUtMSUyMiUyQyUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiklMEElMjMlMjBFbmFibGUlMjBtZW1vcnklMjBvcHRpbWl6YXRpb25zLiUwQXBpcGUuZW5hYmxlX21vZGVsX2NwdV9vZmZsb2FkKCklMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJBJTIwc21hbGwlMjBjYWN0dXMlMjB3aXRoJTIwYSUyMGhhcHB5JTIwZmFjZSUyMGluJTIwdGhlJTIwU2FoYXJhJTIwZGVzZXJ0LiUyMiUwQXZpZGVvcyUyMCUzRCUyMHBpcGUocHJvbXB0KS5mcmFtZXMlNUIwJTVEJTBBZXhwb3J0X3RvX2dpZih2aWRlb3MlMkMlMjAlMjJsYXR0ZS5naWYlMjIp",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> LattePipeline
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_gif
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># You can replace the checkpoint id with &quot;maxin-cn/Latte-1&quot; too.</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>pipe = LattePipeline.from_pretrained(<span class="hljs-string">&quot;maxin-cn/Latte-1&quot;</span>, torch_dtype=torch.float16)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># Enable memory optimizations.</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>pipe.enable_model_cpu_offload()
<span class="hljs-meta">&gt;&gt;&gt; </span>prompt = <span class="hljs-string">&quot;A small cactus with a happy face in the Sahara desert.&quot;</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>videos = pipe(prompt).frames[<span class="hljs-number">0</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>export_to_gif(videos, <span class="hljs-string">&quot;latte.gif&quot;</span>)`,wrap:!1}}),{c(){i=r("p"),i.textContent=J,c=a(),m(v.$$.fragment)},l(p){i=l(p,"P",{"data-svelte-h":!0}),d(i)!=="svelte-kvfsh7"&&(i.textContent=J),c=s(p),f(v.$$.fragment,p)},m(p,M){o(p,i,M),o(p,c,M),u(v,p,M),w=!0},p:rt,i(p){w||(h(v.$$.fragment,p),w=!0)},o(p){g(v.$$.fragment,p),w=!1},d(p){p&&(n(i),n(c)),_(v,p)}}}function bt(ee){let i,J,c,v,w,p,M,He='<img src="https://github.com/Vchitect/Latte/blob/52bc0029899babbd6e9250384c83d8ed2670ff7a/visuals/latte.gif?raw=true" alt="latte text-to-video"/>',pe,j,Ve='<a href="https://huggingface.co/papers/2401.03048" rel="nofollow">Latte: Latent Diffusion Transformer for Video Generation</a> from Monash University, Shanghai AI Lab, Nanjing University, and Nanyang Technological University.',de,k,qe="The abstract from the paper is:",ce,I,Ee="<em>We propose a novel Latent Diffusion Transformer, namely Latte, for video generation. Latte first extracts spatio-temporal tokens from input videos and then adopts a series of Transformer blocks to model video distribution in the latent space. In order to model a substantial number of tokens extracted from videos, four efficient variants are introduced from the perspective of decomposing the spatial and temporal dimensions of input videos. To improve the quality of generated videos, we determine the best practices of Latte through rigorous experimental analysis, including video clip patch embedding, model variants, timestep-class information injection, temporal positional embedding, and learning strategies. Our comprehensive evaluation demonstrates that Latte achieves state-of-the-art performance across four standard video generation datasets, i.e., FaceForensics, SkyTimelapse, UCF101, and Taichi-HD. In addition, we extend Latte to text-to-video generation (T2V) task, where Latte achieves comparable results compared to recent T2V models. We strongly believe that Latte provides valuable insights for future research on incorporating Transformers into diffusion models for video generation.</em>",me,B,Ye='<strong>Highlights</strong>: Latte is a latent diffusion transformer proposed as a backbone for modeling different modalities (trained for text-to-video generation here). It achieves state-of-the-art performance across four standard video benchmarks - <a href="https://huggingface.co/papers/1803.09179" rel="nofollow">FaceForensics</a>, <a href="https://huggingface.co/papers/1709.07592" rel="nofollow">SkyTimelapse</a>, <a href="https://huggingface.co/papers/1212.0402" rel="nofollow">UCF101</a> and <a href="https://huggingface.co/papers/2003.00196" rel="nofollow">Taichi-HD</a>. To prepare and download the datasets for evaluation, please refer to <a href="https://github.com/Vchitect/Latte/blob/main/docs/datasets_evaluation.md" rel="nofollow">this https URL</a>.',fe,C,Se='This pipeline was contributed by <a href="https://github.com/maxin-cn" rel="nofollow">maxin-cn</a>. The original codebase can be found <a href="https://github.com/Vchitect/Latte" rel="nofollow">here</a>. The original weights can be found under <a href="https://huggingface.co/maxin-cn" rel="nofollow">hf.co/maxin-cn</a>.',ue,U,he,G,ge,W,De='Use <a href="https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile" rel="nofollow"><code>torch.compile</code></a> to reduce the inference latency.',_e,P,Ae="First, load the pipeline:",be,X,ve,F,Ke="Then change the memory layout of the pipelines <code>transformer</code> and <code>vae</code> components to <code>torch.channels-last</code>:",Te,N,ye,R,Oe="Finally, compile the components and run inference:",we,z,Me,Q,et='The <a href="https://gist.github.com/a-r-r-o-w/4e1694ca46374793c0361d740a99ff19" rel="nofollow">benchmark</a> results on an 80GB A100 machine are:',Je,H,xe,V,Le,q,tt="Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on video quality depending on the video model.",Ue,E,nt='Refer to the <a href="../../quantization/overview">Quantization</a> overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized <a href="/docs/diffusers/pr_12229/en/api/pipelines/latte#diffusers.LattePipeline">LattePipeline</a> for inference with bitsandbytes.',Ze,Y,$e,S,je,b,D,We,te,ot="Pipeline for text-to-video generation using Latte.",Pe,ne,at=`This model inherits from <a href="/docs/diffusers/pr_12229/en/api/pipelines/overview#diffusers.DiffusionPipeline">DiffusionPipeline</a>. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)`,Xe,x,A,Fe,oe,st="Function invoked when calling the pipeline for generation.",Ne,Z,Re,$,K,ze,ae,it="Encodes the prompt into text encoder hidden states.",ke,O,Ie,ie,Be;return w=new Ge({props:{title:"Latte",local:"latte",headingTag:"h1"}}),U=new ft({props:{$$slots:{default:[gt]},$$scope:{ctx:ee}}}),G=new Ge({props:{title:"Inference",local:"inference",headingTag:"h3"}}),X=new se({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwTGF0dGVQaXBlbGluZSUwQSUwQXBpcGVsaW5lJTIwJTNEJTIwTGF0dGVQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTA5JTIybWF4aW4tY24lMkZMYXR0ZS0xJTIyJTJDJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5mbG9hdDE2JTBBKS50byglMjJjdWRhJTIyKQ==",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> LattePipeline
pipeline = LattePipeline.from_pretrained(
<span class="hljs-string">&quot;maxin-cn/Latte-1&quot;</span>, torch_dtype=torch.float16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)`,wrap:!1}}),N=new se({props:{code:"cGlwZWxpbmUudHJhbnNmb3JtZXIudG8obWVtb3J5X2Zvcm1hdCUzRHRvcmNoLmNoYW5uZWxzX2xhc3QpJTBBcGlwZWxpbmUudmFlLnRvKG1lbW9yeV9mb3JtYXQlM0R0b3JjaC5jaGFubmVsc19sYXN0KQ==",highlighted:`pipeline.transformer.to(memory_format=torch.channels_last)
pipeline.vae.to(memory_format=torch.channels_last)`,wrap:!1}}),z=new se({props:{code:"cGlwZWxpbmUudHJhbnNmb3JtZXIlMjAlM0QlMjB0b3JjaC5jb21waWxlKHBpcGVsaW5lLnRyYW5zZm9ybWVyKSUwQXBpcGVsaW5lLnZhZS5kZWNvZGUlMjAlM0QlMjB0b3JjaC5jb21waWxlKHBpcGVsaW5lLnZhZS5kZWNvZGUpJTBBJTBBdmlkZW8lMjAlM0QlMjBwaXBlbGluZShwcm9tcHQlM0QlMjJBJTIwZG9nJTIwd2VhcmluZyUyMHN1bmdsYXNzZXMlMjBmbG9hdGluZyUyMGluJTIwc3BhY2UlMkMlMjBzdXJyZWFsJTJDJTIwbmVidWxhZSUyMGluJTIwYmFja2dyb3VuZCUyMikuZnJhbWVzJTVCMCU1RA==",highlighted:`pipeline.transformer = torch.<span class="hljs-built_in">compile</span>(pipeline.transformer)
pipeline.vae.decode = torch.<span class="hljs-built_in">compile</span>(pipeline.vae.decode)
video = pipeline(prompt=<span class="hljs-string">&quot;A dog wearing sunglasses floating in space, surreal, nebulae in background&quot;</span>).frames[<span class="hljs-number">0</span>]`,wrap:!1}}),H=new se({props:{code:"V2l0aG91dCUyMHRvcmNoLmNvbXBpbGUoKSUzQSUyMEF2ZXJhZ2UlMjBpbmZlcmVuY2UlMjB0aW1lJTNBJTIwMTYuMjQ2JTIwc2Vjb25kcy4lMEFXaXRoJTIwdG9yY2guY29tcGlsZSgpJTNBJTIwQXZlcmFnZSUyMGluZmVyZW5jZSUyMHRpbWUlM0ElMjAxNC41NzMlMjBzZWNvbmRzLg==",highlighted:`<span class="hljs-attribute">Without</span> torch.compile(): Average inference time: <span class="hljs-number">16</span>.<span class="hljs-number">246</span> seconds.
<span class="hljs-attribute">With</span> torch.compile(): Average inference time: <span class="hljs-number">14</span>.<span class="hljs-number">573</span> seconds.`,wrap:!1}}),V=new Ge({props:{title:"Quantization",local:"quantization",headingTag:"h2"}}),Y=new se({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwQml0c0FuZEJ5dGVzQ29uZmlnJTIwYXMlMjBEaWZmdXNlcnNCaXRzQW5kQnl0ZXNDb25maWclMkMlMjBMYXR0ZVRyYW5zZm9ybWVyM0RNb2RlbCUyQyUyMExhdHRlUGlwZWxpbmUlMEFmcm9tJTIwZGlmZnVzZXJzLnV0aWxzJTIwaW1wb3J0JTIwZXhwb3J0X3RvX2dpZiUwQWZyb20lMjB0cmFuc2Zvcm1lcnMlMjBpbXBvcnQlMjBCaXRzQW5kQnl0ZXNDb25maWclMjBhcyUyMEJpdHNBbmRCeXRlc0NvbmZpZyUyQyUyMFQ1RW5jb2Rlck1vZGVsJTBBJTBBcXVhbnRfY29uZmlnJTIwJTNEJTIwQml0c0FuZEJ5dGVzQ29uZmlnKGxvYWRfaW5fOGJpdCUzRFRydWUpJTBBdGV4dF9lbmNvZGVyXzhiaXQlMjAlM0QlMjBUNUVuY29kZXJNb2RlbC5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIybWF4aW4tY24lMkZMYXR0ZS0xJTIyJTJDJTBBJTIwJTIwJTIwJTIwc3ViZm9sZGVyJTNEJTIydGV4dF9lbmNvZGVyJTIyJTJDJTBBJTIwJTIwJTIwJTIwcXVhbnRpemF0aW9uX2NvbmZpZyUzRHF1YW50X2NvbmZpZyUyQyUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiUyQyUwQSklMEElMEFxdWFudF9jb25maWclMjAlM0QlMjBEaWZmdXNlcnNCaXRzQW5kQnl0ZXNDb25maWcobG9hZF9pbl84Yml0JTNEVHJ1ZSklMEF0cmFuc2Zvcm1lcl84Yml0JTIwJTNEJTIwTGF0dGVUcmFuc2Zvcm1lcjNETW9kZWwuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMm1heGluLWNuJTJGTGF0dGUtMSUyMiUyQyUwQSUyMCUyMCUyMCUyMHN1YmZvbGRlciUzRCUyMnRyYW5zZm9ybWVyJTIyJTJDJTBBJTIwJTIwJTIwJTIwcXVhbnRpemF0aW9uX2NvbmZpZyUzRHF1YW50X2NvbmZpZyUyQyUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiUyQyUwQSklMEElMEFwaXBlbGluZSUyMCUzRCUyMExhdHRlUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMm1heGluLWNuJTJGTGF0dGUtMSUyMiUyQyUwQSUyMCUyMCUyMCUyMHRleHRfZW5jb2RlciUzRHRleHRfZW5jb2Rlcl84Yml0JTJDJTBBJTIwJTIwJTIwJTIwdHJhbnNmb3JtZXIlM0R0cmFuc2Zvcm1lcl84Yml0JTJDJTBBJTIwJTIwJTIwJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5mbG9hdDE2JTJDJTBBJTIwJTIwJTIwJTIwZGV2aWNlX21hcCUzRCUyMmJhbGFuY2VkJTIyJTJDJTBBKSUwQSUwQXByb21wdCUyMCUzRCUyMCUyMkElMjBzbWFsbCUyMGNhY3R1cyUyMHdpdGglMjBhJTIwaGFwcHklMjBmYWNlJTIwaW4lMjB0aGUlMjBTYWhhcmElMjBkZXNlcnQuJTIyJTBBdmlkZW8lMjAlM0QlMjBwaXBlbGluZShwcm9tcHQpLmZyYW1lcyU1QjAlNUQlMEFleHBvcnRfdG9fZ2lmKHZpZGVvJTJDJTIwJTIybGF0dGUuZ2lmJTIyKQ==",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> BitsAndBytesConfig <span class="hljs-keyword">as</span> DiffusersBitsAndBytesConfig, LatteTransformer3DModel, LattePipeline
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_gif
<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> BitsAndBytesConfig <span class="hljs-keyword">as</span> BitsAndBytesConfig, T5EncoderModel
quant_config = BitsAndBytesConfig(load_in_8bit=<span class="hljs-literal">True</span>)
text_encoder_8bit = T5EncoderModel.from_pretrained(
<span class="hljs-string">&quot;maxin-cn/Latte-1&quot;</span>,
subfolder=<span class="hljs-string">&quot;text_encoder&quot;</span>,
quantization_config=quant_config,
torch_dtype=torch.float16,
)
quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=<span class="hljs-literal">True</span>)
transformer_8bit = LatteTransformer3DModel.from_pretrained(
<span class="hljs-string">&quot;maxin-cn/Latte-1&quot;</span>,
subfolder=<span class="hljs-string">&quot;transformer&quot;</span>,
quantization_config=quant_config,
torch_dtype=torch.float16,
)
pipeline = LattePipeline.from_pretrained(
<span class="hljs-string">&quot;maxin-cn/Latte-1&quot;</span>,
text_encoder=text_encoder_8bit,
transformer=transformer_8bit,
torch_dtype=torch.float16,
device_map=<span class="hljs-string">&quot;balanced&quot;</span>,
)
prompt = <span class="hljs-string">&quot;A small cactus with a happy face in the Sahara desert.&quot;</span>
video = pipeline(prompt).frames[<span class="hljs-number">0</span>]
export_to_gif(video, <span class="hljs-string">&quot;latte.gif&quot;</span>)`,wrap:!1}}),S=new Ge({props:{title:"LattePipeline",local:"diffusers.LattePipeline",headingTag:"h2"}}),D=new Qe({props:{name:"class diffusers.LattePipeline",anchor:"diffusers.LattePipeline",parameters:[{name:"tokenizer",val:": T5Tokenizer"},{name:"text_encoder",val:": T5EncoderModel"},{name:"vae",val:": AutoencoderKL"},{name:"transformer",val:": LatteTransformer3DModel"},{name:"scheduler",val:": KarrasDiffusionSchedulers"}],parametersDescription:[{anchor:"diffusers.LattePipeline.vae",description:`<strong>vae</strong> (<a href="/docs/diffusers/pr_12229/en/api/models/autoencoderkl#diffusers.AutoencoderKL">AutoencoderKL</a>) &#x2014;
Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.`,name:"vae"},{anchor:"diffusers.LattePipeline.text_encoder",description:`<strong>text_encoder</strong> (<code>T5EncoderModel</code>) &#x2014;
Frozen text-encoder. Latte uses
<a href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel" rel="nofollow">T5</a>, specifically the
<a href="https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl" rel="nofollow">t5-v1_1-xxl</a> variant.`,name:"text_encoder"},{anchor:"diffusers.LattePipeline.tokenizer",description:`<strong>tokenizer</strong> (<code>T5Tokenizer</code>) &#x2014;
Tokenizer of class
<a href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer" rel="nofollow">T5Tokenizer</a>.`,name:"tokenizer"},{anchor:"diffusers.LattePipeline.transformer",description:`<strong>transformer</strong> (<a href="/docs/diffusers/pr_12229/en/api/models/latte_transformer3d#diffusers.LatteTransformer3DModel">LatteTransformer3DModel</a>) &#x2014;
A text conditioned <code>LatteTransformer3DModel</code> to denoise the encoded video latents.`,name:"transformer"},{anchor:"diffusers.LattePipeline.scheduler",description:`<strong>scheduler</strong> (<a href="/docs/diffusers/pr_12229/en/api/schedulers/overview#diffusers.SchedulerMixin">SchedulerMixin</a>) &#x2014;
A scheduler to be used in combination with <code>transformer</code> to denoise the encoded video latents.`,name:"scheduler"}],source:"https://github.com/huggingface/diffusers/blob/vr_12229/src/diffusers/pipelines/latte/pipeline_latte.py#L145"}}),A=new Qe({props:{name:"__call__",anchor:"diffusers.LattePipeline.__call__",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]] = None"},{name:"negative_prompt",val:": str = ''"},{name:"num_inference_steps",val:": int = 50"},{name:"timesteps",val:": typing.Optional[typing.List[int]] = None"},{name:"guidance_scale",val:": float = 7.5"},{name:"num_images_per_prompt",val:": int = 1"},{name:"video_length",val:": int = 16"},{name:"height",val:": int = 512"},{name:"width",val:": int = 512"},{name:"eta",val:": float = 0.0"},{name:"generator",val:": typing.Union[torch._C.Generator, typing.List[torch._C.Generator], NoneType] = None"},{name:"latents",val:": typing.Optional[torch.FloatTensor] = None"},{name:"prompt_embeds",val:": typing.Optional[torch.FloatTensor] = None"},{name:"negative_prompt_embeds",val:": typing.Optional[torch.FloatTensor] = None"},{name:"output_type",val:": str = 'pil'"},{name:"return_dict",val:": bool = True"},{name:"callback_on_step_end",val:": typing.Union[typing.Callable[[int, int, typing.Dict], NoneType], diffusers.callbacks.PipelineCallback, diffusers.callbacks.MultiPipelineCallbacks, NoneType] = None"},{name:"callback_on_step_end_tensor_inputs",val:": typing.List[str] = ['latents']"},{name:"clean_caption",val:": bool = True"},{name:"mask_feature",val:": bool = True"},{name:"enable_temporal_attentions",val:": bool = True"},{name:"decode_chunk_size",val:": int = 14"}],parametersDescription:[{anchor:"diffusers.LattePipeline.__call__.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) &#x2014;
The prompt or prompts to guide the video generation. If not defined, one has to pass <code>prompt_embeds</code>.
instead.`,name:"prompt"},{anchor:"diffusers.LattePipeline.__call__.negative_prompt",description:`<strong>negative_prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) &#x2014;
The prompt or prompts not to guide the video generation. If not defined, one has to pass
<code>negative_prompt_embeds</code> instead. Ignored when not using guidance (i.e., ignored if <code>guidance_scale</code> is
less than <code>1</code>).`,name:"negative_prompt"},{anchor:"diffusers.LattePipeline.__call__.num_inference_steps",description:`<strong>num_inference_steps</strong> (<code>int</code>, <em>optional</em>, defaults to 100) &#x2014;
The number of denoising steps. More denoising steps usually lead to a higher quality video at the
expense of slower inference.`,name:"num_inference_steps"},{anchor:"diffusers.LattePipeline.__call__.timesteps",description:`<strong>timesteps</strong> (<code>List[int]</code>, <em>optional</em>) &#x2014;
Custom timesteps to use for the denoising process. If not defined, equal spaced <code>num_inference_steps</code>
timesteps are used. Must be in descending order.`,name:"timesteps"},{anchor:"diffusers.LattePipeline.__call__.guidance_scale",description:`<strong>guidance_scale</strong> (<code>float</code>, <em>optional</em>, defaults to 7.0) &#x2014;
Guidance scale as defined in <a href="https://huggingface.co/papers/2207.12598" rel="nofollow">Classifier-Free Diffusion
Guidance</a>. <code>guidance_scale</code> is defined as <code>w</code> of equation 2.
of <a href="https://huggingface.co/papers/2205.11487" rel="nofollow">Imagen Paper</a>. Guidance scale is enabled by setting
<code>guidance_scale &gt; 1</code>. Higher guidance scale encourages to generate videos that are closely linked to
the text <code>prompt</code>, usually at the expense of lower video quality.`,name:"guidance_scale"},{anchor:"diffusers.LattePipeline.__call__.video_length",description:`<strong>video_length</strong> (<code>int</code>, <em>optional</em>, defaults to 16) &#x2014;
The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds`,name:"video_length"},{anchor:"diffusers.LattePipeline.__call__.num_images_per_prompt",description:`<strong>num_images_per_prompt</strong> (<code>int</code>, <em>optional</em>, defaults to 1) &#x2014;
The number of videos to generate per prompt.`,name:"num_images_per_prompt"},{anchor:"diffusers.LattePipeline.__call__.height",description:`<strong>height</strong> (<code>int</code>, <em>optional</em>, defaults to self.unet.config.sample_size) &#x2014;
The height in pixels of the generated video.`,name:"height"},{anchor:"diffusers.LattePipeline.__call__.width",description:`<strong>width</strong> (<code>int</code>, <em>optional</em>, defaults to self.unet.config.sample_size) &#x2014;
The width in pixels of the generated video.`,name:"width"},{anchor:"diffusers.LattePipeline.__call__.eta",description:`<strong>eta</strong> (<code>float</code>, <em>optional</em>, defaults to 0.0) &#x2014;
Corresponds to parameter eta (&#x3B7;) in the DDIM paper: <a href="https://huggingface.co/papers/2010.02502" rel="nofollow">https://huggingface.co/papers/2010.02502</a>. Only
applies to <a href="/docs/diffusers/pr_12229/en/api/schedulers/ddim#diffusers.DDIMScheduler">schedulers.DDIMScheduler</a>, will be ignored for others.`,name:"eta"},{anchor:"diffusers.LattePipeline.__call__.generator",description:`<strong>generator</strong> (<code>torch.Generator</code> or <code>List[torch.Generator]</code>, <em>optional</em>) &#x2014;
One or a list of <a href="https://pytorch.org/docs/stable/generated/torch.Generator.html" rel="nofollow">torch generator(s)</a>
to make generation deterministic.`,name:"generator"},{anchor:"diffusers.LattePipeline.__call__.latents",description:`<strong>latents</strong> (<code>torch.FloatTensor</code>, <em>optional</em>) &#x2014;
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will be generated by sampling using the supplied random <code>generator</code>.`,name:"latents"},{anchor:"diffusers.LattePipeline.__call__.prompt_embeds",description:`<strong>prompt_embeds</strong> (<code>torch.FloatTensor</code>, <em>optional</em>) &#x2014;
Pre-generated text embeddings. Can be used to easily tweak text inputs, <em>e.g.</em> prompt weighting. If not
provided, text embeddings will be generated from <code>prompt</code> input argument.`,name:"prompt_embeds"},{anchor:"diffusers.LattePipeline.__call__.negative_prompt_embeds",description:`<strong>negative_prompt_embeds</strong> (<code>torch.FloatTensor</code>, <em>optional</em>) &#x2014;
Pre-generated negative text embeddings. For Latte this negative prompt should be &quot;&quot;. If not provided,
negative_prompt_embeds will be generated from <code>negative_prompt</code> input argument.`,name:"negative_prompt_embeds"},{anchor:"diffusers.LattePipeline.__call__.output_type",description:`<strong>output_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;pil&quot;</code>) &#x2014;
The output format of the generate video. Choose between
<a href="https://pillow.readthedocs.io/en/stable/" rel="nofollow">PIL</a>: <code>PIL.Image.Image</code> or <code>np.array</code>.`,name:"output_type"},{anchor:"diffusers.LattePipeline.__call__.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
Whether or not to return a <code>~pipelines.stable_diffusion.IFPipelineOutput</code> instead of a plain tuple.`,name:"return_dict"},{anchor:"diffusers.LattePipeline.__call__.callback_on_step_end",description:`<strong>callback_on_step_end</strong> (<code>Callable[[int, int, Dict], None]</code>, <code>PipelineCallback</code>, <code>MultiPipelineCallbacks</code>, <em>optional</em>) &#x2014;
A callback function or a list of callback functions to be called at the end of each denoising step.`,name:"callback_on_step_end"},{anchor:"diffusers.LattePipeline.__call__.callback_on_step_end_tensor_inputs",description:`<strong>callback_on_step_end_tensor_inputs</strong> (<code>List[str]</code>, <em>optional</em>) &#x2014;
A list of tensor inputs that should be passed to the callback function. If not defined, all tensor
inputs will be passed.`,name:"callback_on_step_end_tensor_inputs"},{anchor:"diffusers.LattePipeline.__call__.clean_caption",description:`<strong>clean_caption</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
Whether or not to clean the caption before creating embeddings. Requires <code>beautifulsoup4</code> and <code>ftfy</code> to
be installed. If the dependencies are not installed, the embeddings will be created from the raw
prompt.`,name:"clean_caption"},{anchor:"diffusers.LattePipeline.__call__.mask_feature",description:"<strong>mask_feature</strong> (<code>bool</code> defaults to <code>True</code>) &#x2014; If set to <code>True</code>, the text embeddings will be masked.",name:"mask_feature"},{anchor:"diffusers.LattePipeline.__call__.enable_temporal_attentions",description:"<strong>enable_temporal_attentions</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014; Whether to enable temporal attentions",name:"enable_temporal_attentions"},{anchor:"diffusers.LattePipeline.__call__.decode_chunk_size",description:`<strong>decode_chunk_size</strong> (<code>int</code>, <em>optional</em>) &#x2014;
The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
For lower memory usage, reduce <code>decode_chunk_size</code>.`,name:"decode_chunk_size"}],source:"https://github.com/huggingface/diffusers/blob/vr_12229/src/diffusers/pipelines/latte/pipeline_latte.py#L613",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>
<p>If <code>return_dict</code> is <code>True</code>, <code>LattePipelineOutput</code> is returned,
otherwise a <code>tuple</code> is returned where the first element is a list with the generated images</p>
`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>
<p><code>LattePipelineOutput</code> or <code>tuple</code></p>
`}}),Z=new ut({props:{anchor:"diffusers.LattePipeline.__call__.example",$$slots:{default:[_t]},$$scope:{ctx:ee}}}),K=new Qe({props:{name:"encode_prompt",anchor:"diffusers.LattePipeline.encode_prompt",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]]"},{name:"do_classifier_free_guidance",val:": bool = True"},{name:"negative_prompt",val:": str = ''"},{name:"num_images_per_prompt",val:": int = 1"},{name:"device",val:": typing.Optional[torch.device] = None"},{name:"prompt_embeds",val:": typing.Optional[torch.FloatTensor] = None"},{name:"negative_prompt_embeds",val:": typing.Optional[torch.FloatTensor] = None"},{name:"clean_caption",val:": bool = False"},{name:"mask_feature",val:": bool = True"},{name:"dtype",val:" = None"}],parametersDescription:[{anchor:"diffusers.LattePipeline.encode_prompt.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) &#x2014;
prompt to be encoded`,name:"prompt"},{anchor:"diffusers.LattePipeline.encode_prompt.negative_prompt",description:`<strong>negative_prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) &#x2014;
The prompt not to guide the video generation. If not defined, one has to pass <code>negative_prompt_embeds</code>
instead. Ignored when not using guidance (i.e., ignored if <code>guidance_scale</code> is less than <code>1</code>). For
Latte, this should be &quot;&quot;.`,name:"negative_prompt"},{anchor:"diffusers.LattePipeline.encode_prompt.do_classifier_free_guidance",description:`<strong>do_classifier_free_guidance</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
whether to use classifier free guidance or not`,name:"do_classifier_free_guidance"},{anchor:"diffusers.LattePipeline.encode_prompt.num_images_per_prompt",description:`<strong>num_images_per_prompt</strong> (<code>int</code>, <em>optional</em>, defaults to 1) &#x2014;
number of video that should be generated per prompt`,name:"num_images_per_prompt"},{anchor:"diffusers.LattePipeline.encode_prompt.device",description:`<strong>device</strong> &#x2014; (<code>torch.device</code>, <em>optional</em>):
torch device to place the resulting embeddings on`,name:"device"},{anchor:"diffusers.LattePipeline.encode_prompt.prompt_embeds",description:`<strong>prompt_embeds</strong> (<code>torch.FloatTensor</code>, <em>optional</em>) &#x2014;
Pre-generated text embeddings. Can be used to easily tweak text inputs, <em>e.g.</em> prompt weighting. If not
provided, text embeddings will be generated from <code>prompt</code> input argument.`,name:"prompt_embeds"},{anchor:"diffusers.LattePipeline.encode_prompt.negative_prompt_embeds",description:`<strong>negative_prompt_embeds</strong> (<code>torch.FloatTensor</code>, <em>optional</em>) &#x2014;
Pre-generated negative text embeddings. For Latte, it&#x2019;s should be the embeddings of the &quot;&quot; string.`,name:"negative_prompt_embeds"},{anchor:"diffusers.LattePipeline.encode_prompt.clean_caption",description:`<strong>clean_caption</strong> (bool, defaults to <code>False</code>) &#x2014;
If <code>True</code>, the function will preprocess and clean the provided caption before encoding.`,name:"clean_caption"},{anchor:"diffusers.LattePipeline.encode_prompt.mask_feature",description:`<strong>mask_feature</strong> &#x2014; (bool, defaults to <code>True</code>):
If <code>True</code>, the function will mask the text embeddings.`,name:"mask_feature"}],source:"https://github.com/huggingface/diffusers/blob/vr_12229/src/diffusers/pipelines/latte/pipeline_latte.py#L206"}}),O=new ht({props:{source:"https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/latte.md"}}),{c(){i=r("meta"),J=a(),c=r("p"),v=a(),m(w.$$.fragment),p=a(),M=r("p"),M.innerHTML=He,pe=a(),j=r("p"),j.innerHTML=Ve,de=a(),k=r("p"),k.textContent=qe,ce=a(),I=r("p"),I.innerHTML=Ee,me=a(),B=r("p"),B.innerHTML=Ye,fe=a(),C=r("p"),C.innerHTML=Se,ue=a(),m(U.$$.fragment),he=a(),m(G.$$.fragment),ge=a(),W=r("p"),W.innerHTML=De,_e=a(),P=r("p"),P.textContent=Ae,be=a(),m(X.$$.fragment),ve=a(),F=r("p"),F.innerHTML=Ke,Te=a(),m(N.$$.fragment),ye=a(),R=r("p"),R.textContent=Oe,we=a(),m(z.$$.fragment),Me=a(),Q=r("p"),Q.innerHTML=et,Je=a(),m(H.$$.fragment),xe=a(),m(V.$$.fragment),Le=a(),q=r("p"),q.textContent=tt,Ue=a(),E=r("p"),E.innerHTML=nt,Ze=a(),m(Y.$$.fragment),$e=a(),m(S.$$.fragment),je=a(),b=r("div"),m(D.$$.fragment),We=a(),te=r("p"),te.textContent=ot,Pe=a(),ne=r("p"),ne.innerHTML=at,Xe=a(),x=r("div"),m(A.$$.fragment),Fe=a(),oe=r("p"),oe.textContent=st,Ne=a(),m(Z.$$.fragment),Re=a(),$=r("div"),m(K.$$.fragment),ze=a(),ae=r("p"),ae.textContent=it,ke=a(),m(O.$$.fragment),Ie=a(),ie=r("p"),this.h()},l(e){const t=mt("svelte-u9bgzb",document.head);i=l(t,"META",{name:!0,content:!0}),t.forEach(n),J=s(e),c=l(e,"P",{}),re(c).forEach(n),v=s(e),f(w.$$.fragment,e),p=s(e),M=l(e,"P",{"data-svelte-h":!0}),d(M)!=="svelte-1bd7mp4"&&(M.innerHTML=He),pe=s(e),j=l(e,"P",{"data-svelte-h":!0}),d(j)!=="svelte-qqql9f"&&(j.innerHTML=Ve),de=s(e),k=l(e,"P",{"data-svelte-h":!0}),d(k)!=="svelte-1cwsb16"&&(k.textContent=qe),ce=s(e),I=l(e,"P",{"data-svelte-h":!0}),d(I)!=="svelte-12ap5po"&&(I.innerHTML=Ee),me=s(e),B=l(e,"P",{"data-svelte-h":!0}),d(B)!=="svelte-1grsw2n"&&(B.innerHTML=Ye),fe=s(e),C=l(e,"P",{"data-svelte-h":!0}),d(C)!=="svelte-66m9gt"&&(C.innerHTML=Se),ue=s(e),f(U.$$.fragment,e),he=s(e),f(G.$$.fragment,e),ge=s(e),W=l(e,"P",{"data-svelte-h":!0}),d(W)!=="svelte-iekg51"&&(W.innerHTML=De),_e=s(e),P=l(e,"P",{"data-svelte-h":!0}),d(P)!=="svelte-jub7f1"&&(P.textContent=Ae),be=s(e),f(X.$$.fragment,e),ve=s(e),F=l(e,"P",{"data-svelte-h":!0}),d(F)!=="svelte-4294wb"&&(F.innerHTML=Ke),Te=s(e),f(N.$$.fragment,e),ye=s(e),R=l(e,"P",{"data-svelte-h":!0}),d(R)!=="svelte-9i4prs"&&(R.textContent=Oe),we=s(e),f(z.$$.fragment,e),Me=s(e),Q=l(e,"P",{"data-svelte-h":!0}),d(Q)!=="svelte-xadg5n"&&(Q.innerHTML=et),Je=s(e),f(H.$$.fragment,e),xe=s(e),f(V.$$.fragment,e),Le=s(e),q=l(e,"P",{"data-svelte-h":!0}),d(q)!=="svelte-1ou2pxc"&&(q.textContent=tt),Ue=s(e),E=l(e,"P",{"data-svelte-h":!0}),d(E)!=="svelte-14po9eo"&&(E.innerHTML=nt),Ze=s(e),f(Y.$$.fragment,e),$e=s(e),f(S.$$.fragment,e),je=s(e),b=l(e,"DIV",{class:!0});var y=re(b);f(D.$$.fragment,y),We=s(y),te=l(y,"P",{"data-svelte-h":!0}),d(te)!=="svelte-5dgzru"&&(te.textContent=ot),Pe=s(y),ne=l(y,"P",{"data-svelte-h":!0}),d(ne)!=="svelte-rvpa7"&&(ne.innerHTML=at),Xe=s(y),x=l(y,"DIV",{class:!0});var L=re(x);f(A.$$.fragment,L),Fe=s(L),oe=l(L,"P",{"data-svelte-h":!0}),d(oe)!=="svelte-v78lg8"&&(oe.textContent=st),Ne=s(L),f(Z.$$.fragment,L),L.forEach(n),Re=s(y),$=l(y,"DIV",{class:!0});var Ce=re($);f(K.$$.fragment,Ce),ze=s(Ce),ae=l(Ce,"P",{"data-svelte-h":!0}),d(ae)!=="svelte-16q0ax1"&&(ae.textContent=it),Ce.forEach(n),y.forEach(n),ke=s(e),f(O.$$.fragment,e),Ie=s(e),ie=l(e,"P",{}),re(ie).forEach(n),this.h()},h(){le(i,"name","hf:doc:metadata"),le(i,"content",vt),le(x,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),le($,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),le(b,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8")},m(e,t){T(document.head,i),o(e,J,t),o(e,c,t),o(e,v,t),u(w,e,t),o(e,p,t),o(e,M,t),o(e,pe,t),o(e,j,t),o(e,de,t),o(e,k,t),o(e,ce,t),o(e,I,t),o(e,me,t),o(e,B,t),o(e,fe,t),o(e,C,t),o(e,ue,t),u(U,e,t),o(e,he,t),u(G,e,t),o(e,ge,t),o(e,W,t),o(e,_e,t),o(e,P,t),o(e,be,t),u(X,e,t),o(e,ve,t),o(e,F,t),o(e,Te,t),u(N,e,t),o(e,ye,t),o(e,R,t),o(e,we,t),u(z,e,t),o(e,Me,t),o(e,Q,t),o(e,Je,t),u(H,e,t),o(e,xe,t),u(V,e,t),o(e,Le,t),o(e,q,t),o(e,Ue,t),o(e,E,t),o(e,Ze,t),u(Y,e,t),o(e,$e,t),u(S,e,t),o(e,je,t),o(e,b,t),u(D,b,null),T(b,We),T(b,te),T(b,Pe),T(b,ne),T(b,Xe),T(b,x),u(A,x,null),T(x,Fe),T(x,oe),T(x,Ne),u(Z,x,null),T(b,Re),T(b,$),u(K,$,null),T($,ze),T($,ae),o(e,ke,t),u(O,e,t),o(e,Ie,t),o(e,ie,t),Be=!0},p(e,[t]){const y={};t&2&&(y.$$scope={dirty:t,ctx:e}),U.$set(y);const L={};t&2&&(L.$$scope={dirty:t,ctx:e}),Z.$set(L)},i(e){Be||(h(w.$$.fragment,e),h(U.$$.fragment,e),h(G.$$.fragment,e),h(X.$$.fragment,e),h(N.$$.fragment,e),h(z.$$.fragment,e),h(H.$$.fragment,e),h(V.$$.fragment,e),h(Y.$$.fragment,e),h(S.$$.fragment,e),h(D.$$.fragment,e),h(A.$$.fragment,e),h(Z.$$.fragment,e),h(K.$$.fragment,e),h(O.$$.fragment,e),Be=!0)},o(e){g(w.$$.fragment,e),g(U.$$.fragment,e),g(G.$$.fragment,e),g(X.$$.fragment,e),g(N.$$.fragment,e),g(z.$$.fragment,e),g(H.$$.fragment,e),g(V.$$.fragment,e),g(Y.$$.fragment,e),g(S.$$.fragment,e),g(D.$$.fragment,e),g(A.$$.fragment,e),g(Z.$$.fragment,e),g(K.$$.fragment,e),g(O.$$.fragment,e),Be=!1},d(e){e&&(n(J),n(c),n(v),n(p),n(M),n(pe),n(j),n(de),n(k),n(ce),n(I),n(me),n(B),n(fe),n(C),n(ue),n(he),n(ge),n(W),n(_e),n(P),n(be),n(ve),n(F),n(Te),n(ye),n(R),n(we),n(Me),n(Q),n(Je),n(xe),n(Le),n(q),n(Ue),n(E),n(Ze),n($e),n(je),n(b),n(ke),n(Ie),n(ie)),n(i),_(w,e),_(U,e),_(G,e),_(X,e),_(N,e),_(z,e),_(H,e),_(V,e),_(Y,e),_(S,e),_(D),_(A),_(Z),_(K),_(O,e)}}}const vt='{"title":"Latte","local":"latte","sections":[{"title":"Inference","local":"inference","sections":[],"depth":3},{"title":"Quantization","local":"quantization","sections":[],"depth":2},{"title":"LattePipeline","local":"diffusers.LattePipeline","sections":[],"depth":2}],"depth":1}';function Tt(ee){return pt(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Zt extends dt{constructor(i){super(),ct(this,i,Tt,bt,lt,{})}}export{Zt as component};

Xet Storage Details

Size:
36.4 kB
·
Xet hash:
e83e9d17f90bc906b98f29ef3a142042e853dc8e8784b9588ffc2f193f7c83b1

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.