Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev/diffusers/pr_11335/en/_app/immutable/nodes/110.b98eaebf.js

rtrm's picture

29 days ago

31.1 kB

	// Generated, minified Svelte page module for the "I2VGen-XL" pipeline documentation page
	// (doc source referenced below: docs/source/en/api/pipelines/i2vgenxl.md).
	// The single export is the page component class `dt`, exported under the name `component`.
	// NOTE(review): the single-letter helpers imported from the chunks are opaque here; comments
	// below describe them only as far as their call sites show — confirm against the chunk sources.
	import{s as Se,o as Ee,n as Ne}from"../chunks/scheduler.8c3d61f6.js";
	import{S as He,i as Re,g as r,s,r as _,A as Oe,h as l,f as n,c as o,j as K,u as v,x as g,k as ee,y as d,a as i,v as b,d as y,t as I,w}from"../chunks/index.da70eac4.js";
	import{T as Fe}from"../chunks/Tip.1d9b8c37.js";
	import{D as ve}from"../chunks/Docstring.567bc132.js";
	import{C as qe}from"../chunks/CodeBlock.a9c4becf.js";
	import{E as Ae}from"../chunks/ExampleCodeBlock.15b54358.js";
	import{H as be,E as Qe}from"../chunks/index.5d4ab994.js";

	// Ye: fragment used as the default slot of the Tip component (`J` in `et`).
	// It owns one "p" element whose innerHTML is the static tip text `L`.
	// Returns an object with c / l / m / p / d methods; `p` is the imported `Ne`
	// (presumably a no-op, since nothing in this fragment is dynamic — TODO confirm).
	function Ye(H){let a,L='Make sure to check out the Schedulers <a href="../../using-diffusers/schedulers">guide</a> to learn how to explore the tradeoff between scheduler speed and quality, and see the <a href="../../using-diffusers/loading#reuse-a-pipeline">reuse components across pipelines</a> section to learn how to efficiently load the same components into multiple pipelines. Also, to know more about reducing the memory usage of this pipeline, refer to the [“Reduce memory usage”] section <a href="../../using-diffusers/svd#reduce-memory-usage">here</a>.';return{c(){a=r("p"),a.innerHTML=L},l(c){a=l(c,"P",{"data-svelte-h":!0}),g(a)!=="svelte-1u6tg32"&&(a.innerHTML=L)},m(c,f){i(c,a,f)},p:Ne,d(c){c&&n(a)}}}

	// Ke: fragment used as the default slot of the example code block (`V` in `et`).
	// It renders an "Examples:" paragraph followed by a CodeBlock (`qe`) instance that receives
	// a `code` payload (looks like base64 of the URL-encoded example — not decoded here) and the
	// pre-highlighted HTML of the same Python example. `h` tracks whether the intro (`i`) has run.
	function Ke(H){let a,L="Examples:",c,f,h;return f=new qe({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwSTJWR2VuWExQaXBlbGluZSUwQWZyb20lMjBkaWZmdXNlcnMudXRpbHMlMjBpbXBvcnQlMjBleHBvcnRfdG9fZ2lmJTJDJTIwbG9hZF9pbWFnZSUwQSUwQXBpcGVsaW5lJTIwJTNEJTIwSTJWR2VuWExQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyYWxpLXZpbGFiJTJGaTJ2Z2VuLXhsJTIyJTJDJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5mbG9hdDE2JTJDJTIwdmFyaWFudCUzRCUyMmZwMTYlMjIlMEEpJTBBcGlwZWxpbmUuZW5hYmxlX21vZGVsX2NwdV9vZmZsb2FkKCklMEElMEFpbWFnZV91cmwlMjAlM0QlMjAoJTBBJTIwJTIwJTIwJTIwJTIyaHR0cHMlM0ElMkYlMkZodWdnaW5nZmFjZS5jbyUyRmRhdGFzZXRzJTJGZGlmZnVzZXJzJTJGZG9jcy1pbWFnZXMlMkZyZXNvbHZlJTJGbWFpbiUyRmkydmdlbl94bF9pbWFnZXMlMkZpbWdfMDAwOS5wbmclMjIlMEEpJTBBaW1hZ2UlMjAlM0QlMjBsb2FkX2ltYWdlKGltYWdlX3VybCkuY29udmVydCglMjJSR0IlMjIpJTBBJTBBcHJvbXB0JTIwJTNEJTIwJTIyUGFwZXJzJTIwd2VyZSUyMGZsb2F0aW5nJTIwaW4lMjB0aGUlMjBhaXIlMjBvbiUyMGElMjB0YWJsZSUyMGluJTIwdGhlJTIwbGlicmFyeSUyMiUwQW5lZ2F0aXZlX3Byb21wdCUyMCUzRCUyMCUyMkRpc3RvcnRlZCUyQyUyMGRpc2NvbnRpbnVvdXMlMkMlMjBVZ2x5JTJDJTIwYmx1cnJ5JTJDJTIwbG93JTIwcmVzb2x1dGlvbiUyQyUyMG1vdGlvbmxlc3MlMkMlMjBzdGF0aWMlMkMlMjBkaXNmaWd1cmVkJTJDJTIwZGlzY29ubmVjdGVkJTIwbGltYnMlMkMlMjBVZ2x5JTIwZmFjZXMlMkMlMjBpbmNvbXBsZXRlJTIwYXJtcyUyMiUwQWdlbmVyYXRvciUyMCUzRCUyMHRvcmNoLm1hbnVhbF9zZWVkKDg4ODgpJTBBJTBBZnJhbWVzJTIwJTNEJTIwcGlwZWxpbmUoJTBBJTIwJTIwJTIwJTIwcHJvbXB0JTNEcHJvbXB0JTJDJTBBJTIwJTIwJTIwJTIwaW1hZ2UlM0RpbWFnZSUyQyUwQSUyMCUyMCUyMCUyMG51bV9pbmZlcmVuY2Vfc3RlcHMlM0Q1MCUyQyUwQSUyMCUyMCUyMCUyMG5lZ2F0aXZlX3Byb21wdCUzRG5lZ2F0aXZlX3Byb21wdCUyQyUwQSUyMCUyMCUyMCUyMGd1aWRhbmNlX3NjYWxlJTNEOS4wJTJDJTBBJTIwJTIwJTIwJTIwZ2VuZXJhdG9yJTNEZ2VuZXJhdG9yJTJDJTBBKS5mcmFtZXMlNUIwJTVEJTBBdmlkZW9fcGF0aCUyMCUzRCUyMGV4cG9ydF90b19naWYoZnJhbWVzJTJDJTIwJTIyaTJ2LmdpZiUyMik=",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> torch
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> I2VGenXLPipeline
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_gif, load_image

	<span class="hljs-meta">>>> </span>pipeline = I2VGenXLPipeline.from_pretrained(
	<span class="hljs-meta">... </span> <span class="hljs-string">"ali-vilab/i2vgen-xl"</span>, torch_dtype=torch.float16, variant=<span class="hljs-string">"fp16"</span>
	<span class="hljs-meta">... </span>)
	<span class="hljs-meta">>>> </span>pipeline.enable_model_cpu_offload()

	<span class="hljs-meta">>>> </span>image_url = (
	<span class="hljs-meta">... </span> <span class="hljs-string">"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"</span>
	<span class="hljs-meta">... </span>)
	<span class="hljs-meta">>>> </span>image = load_image(image_url).convert(<span class="hljs-string">"RGB"</span>)

	<span class="hljs-meta">>>> </span>prompt = <span class="hljs-string">"Papers were floating in the air on a table in the library"</span>
	<span class="hljs-meta">>>> </span>negative_prompt = <span class="hljs-string">"Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms"</span>
	<span class="hljs-meta">>>> </span>generator = torch.manual_seed(<span class="hljs-number">8888</span>)

	<span class="hljs-meta">>>> </span>frames = pipeline(
	<span class="hljs-meta">... </span> prompt=prompt,
	<span class="hljs-meta">... </span> image=image,
	<span class="hljs-meta">... </span> num_inference_steps=<span class="hljs-number">50</span>,
	<span class="hljs-meta">... </span> negative_prompt=negative_prompt,
	<span class="hljs-meta">... </span> guidance_scale=<span class="hljs-number">9.0</span>,
	<span class="hljs-meta">... </span> generator=generator,
	<span class="hljs-meta">... </span>).frames[<span class="hljs-number">0</span>]
	<span class="hljs-meta">>>> </span>video_path = export_to_gif(frames, <span class="hljs-string">"i2v.gif"</span>)`,wrap:!1}}),{c(){a=r("p"),a.textContent=L,c=s(),_(f.$$.fragment)},l(p){a=l(p,"P",{"data-svelte-h":!0}),g(a)!=="svelte-kvfsh7"&&(a.textContent=L),c=o(p),v(f.$$.fragment,p)},m(p,x){i(p,a,x),i(p,c,x),b(f,p,x),h=!0},p:Ne,i(p){h||(y(f.$$.fragment,p),h=!0)},o(p){I(f.$$.fragment,p),h=!1},d(p){p&&(n(a),n(c)),w(f,p)}}}

	// et: main page fragment. Declares every DOM node, static HTML/text string and child component
	// of the page, instantiates the child components, and returns the fragment's lifecycle object:
	//   c  - create nodes and child fragments from scratch
	//   l  - counterpart of `c` that works from existing nodes under `e` (presumably hydration — TODO confirm)
	//   h  - set attributes/classes on the created nodes
	//   m  - insert nodes and mount child components in document order
	//   p  - forward `$$scope` changes (dirty bit 2) to the two slotted components J and V
	//   i/o - run the intro/outro of every child component, guarded by the `he` flag
	//   d  - detach nodes (when `e` is truthy) and destroy child components
	function et(H){let a,L,c,f,h,p,x,Ve='<a href="https://hf.co/papers/2311.04145.pdf" rel="nofollow">I2VGen-XL: High-Quality Image-to-Video Synthesis via Cascaded Diffusion Models</a> by Shiwei Zhang, Jiayu Wang, Yingya Zhang, Kang Zhao, Hangjie Yuan, Zhiwu Qin, Xiang Wang, Deli Zhao, and Jingren Zhou.',ne,P,Xe="The abstract from the paper is:",ie,$,Pe='<em>Video synthesis has recently made remarkable strides benefiting from the rapid development of diffusion models. However, it still encounters challenges in terms of semantic accuracy, clarity and spatio-temporal continuity. They primarily arise from the scarcity of well-aligned text-video data and the complex inherent structure of videos, making it difficult for the model to simultaneously ensure semantic and qualitative excellence. In this report, we propose a cascaded I2VGen-XL approach that enhances model performance by decoupling these two factors and ensures the alignment of the input data by utilizing static images as a form of crucial guidance. I2VGen-XL consists of two stages: i) the base stage guarantees coherent semantics and preserves content from input images by using two hierarchical encoders, and ii) the refinement stage enhances the video’s details by incorporating an additional brief text and improves the resolution to 1280×720. To improve the diversity, we collect around 35 million single-shot text-video pairs and 6 billion text-image pairs to optimize the model. By this means, I2VGen-XL can simultaneously enhance the semantic accuracy, continuity of details and clarity of generated videos. Through extensive experiments, we have investigated the underlying principles of I2VGen-XL and compared it with current top methods, which can demonstrate its effectiveness on diverse data. The source code and models will be publicly available at <a href="https://i2vgen-xl.github.io/" rel="nofollow">this https URL</a>.</em>',se,k,$e='The original codebase can be found <a href="https://github.com/ali-vilab/i2vgen-xl/" rel="nofollow">here</a>. The model checkpoints can be found <a href="https://huggingface.co/ali-vilab/" rel="nofollow">here</a>.',oe,J,ae,C,ke="Sample output with I2VGenXL:",re,j,Ce=`<tbody><tr><td><center>library.
	<br/> <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/i2vgen-xl-example.gif" alt="library" style="width: 300px;"/></center></td></tr></tbody>`,le,Z,pe,U,je='<li>I2VGenXL always uses a <code>clip_skip</code> value of 1. This means it leverages the penultimate layer representations from the text encoder of CLIP.</li> <li>It can generate videos of quality that is often on par with <a href="../../using-diffusers/svd">Stable Video Diffusion</a> (SVD).</li> <li>Unlike SVD, it additionally accepts text prompts as inputs.</li> <li>It can generate higher resolution videos.</li> <li>When using the <a href="/docs/diffusers/pr_11335/en/api/schedulers/ddim#diffusers.DDIMScheduler">DDIMScheduler</a> (which is default for this pipeline), less than 50 steps for inference leads to bad results.</li> <li>This implementation is 1-stage variant of I2VGenXL. The main figure in the <a href="https://arxiv.org/abs/2311.04145" rel="nofollow">I2VGen-XL</a> paper shows a 2-stage variant, however, 1-stage variant works well. See <a href="https://github.com/huggingface/diffusers/discussions/7952" rel="nofollow">this discussion</a> for more details.</li>',de,B,ce,m,D,ye,R,Ze='Pipeline for image-to-video generation as proposed in <a href="https://i2vgen-xl.github.io/" rel="nofollow">I2VGenXL</a>.',Ie,O,Ue=`This model inherits from <a href="/docs/diffusers/pr_11335/en/api/pipelines/overview#diffusers.DiffusionPipeline">DiffusionPipeline</a>. Check the superclass documentation for the generic methods
	implemented for all pipelines (downloading, saving, running on a particular device, etc.).`,we,G,W,xe,F,Be='The call function to the pipeline for image-to-video generation with <a href="/docs/diffusers/pr_11335/en/api/pipelines/i2vgenxl#diffusers.I2VGenXLPipeline">I2VGenXLPipeline</a>.',Te,V,Le,X,z,Ge,q,De="Encodes the prompt into text encoder hidden states.",me,N,ge,T,S,Me,A,We="Output class for image-to-video pipeline.",Je,Q,ze=`PIL image sequences of length <code>num_frames.</code> It can also be a NumPy array or Torch tensor of shape
	<code>(batch_size, num_frames, channels, height, width)</code>`,fe,E,ue,te,he;
	// Child components, in page order. The `return` expression is one comma sequence that ends
	// with the lifecycle object, so the line breaks below always follow a comma.
	return h=new be({props:{title:"I2VGen-XL",local:"i2vgen-xl",headingTag:"h1"}}),
	J=new Fe({props:{$$slots:{default:[Ye]},$$scope:{ctx:H}}}),
	Z=new be({props:{title:"Notes",local:"notes",headingTag:"h2"}}),
	B=new be({props:{title:"I2VGenXLPipeline",local:"diffusers.I2VGenXLPipeline",headingTag:"h2"}}),
	D=new ve({props:{name:"class diffusers.I2VGenXLPipeline",anchor:"diffusers.I2VGenXLPipeline",parameters:[{name:"vae",val:": AutoencoderKL"},{name:"text_encoder",val:": CLIPTextModel"},{name:"tokenizer",val:": CLIPTokenizer"},{name:"image_encoder",val:": CLIPVisionModelWithProjection"},{name:"feature_extractor",val:": CLIPImageProcessor"},{name:"unet",val:": I2VGenXLUNet"},{name:"scheduler",val:": DDIMScheduler"}],parametersDescription:[{anchor:"diffusers.I2VGenXLPipeline.vae",description:`<strong>vae</strong> (<a href="/docs/diffusers/pr_11335/en/api/models/autoencoderkl#diffusers.AutoencoderKL">AutoencoderKL</a>) —
	Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.`,name:"vae"},{anchor:"diffusers.I2VGenXLPipeline.text_encoder",description:`<strong>text_encoder</strong> (<code>CLIPTextModel</code>) —
	Frozen text-encoder (<a href="https://huggingface.co/openai/clip-vit-large-patch14" rel="nofollow">clip-vit-large-patch14</a>).`,name:"text_encoder"},{anchor:"diffusers.I2VGenXLPipeline.tokenizer",description:`<strong>tokenizer</strong> (<code>CLIPTokenizer</code>) —
	A <a href="https://huggingface.co/docs/transformers/main/en/model_doc/clip#transformers.CLIPTokenizer" rel="nofollow">CLIPTokenizer</a> to tokenize text.`,name:"tokenizer"},{anchor:"diffusers.I2VGenXLPipeline.unet",description:`<strong>unet</strong> (<code>I2VGenXLUNet</code>) —
	A <code>I2VGenXLUNet</code> to denoise the encoded video latents.`,name:"unet"},{anchor:"diffusers.I2VGenXLPipeline.scheduler",description:`<strong>scheduler</strong> (<a href="/docs/diffusers/pr_11335/en/api/schedulers/ddim#diffusers.DDIMScheduler">DDIMScheduler</a>) —
	A scheduler to be used in combination with <code>unet</code> to denoise the encoded image latents.`,name:"scheduler"}],source:"https://github.com/huggingface/diffusers/blob/vr_11335/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py#L99"}}),
	W=new ve({props:{name:"__call__",anchor:"diffusers.I2VGenXLPipeline.__call__",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]] = None"},{name:"image",val:": typing.Union[PIL.Image.Image, numpy.ndarray, torch.Tensor, typing.List[PIL.Image.Image], typing.List[numpy.ndarray], typing.List[torch.Tensor]] = None"},{name:"height",val:": typing.Optional[int] = 704"},{name:"width",val:": typing.Optional[int] = 1280"},{name:"target_fps",val:": typing.Optional[int] = 16"},{name:"num_frames",val:": int = 16"},{name:"num_inference_steps",val:": int = 50"},{name:"guidance_scale",val:": float = 9.0"},{name:"negative_prompt",val:": typing.Union[str, typing.List[str], NoneType] = None"},{name:"eta",val:": float = 0.0"},{name:"num_videos_per_prompt",val:": typing.Optional[int] = 1"},{name:"decode_chunk_size",val:": typing.Optional[int] = 1"},{name:"generator",val:": typing.Union[torch._C.Generator, typing.List[torch._C.Generator], NoneType] = None"},{name:"latents",val:": typing.Optional[torch.Tensor] = None"},{name:"prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"output_type",val:": typing.Optional[str] = 'pil'"},{name:"return_dict",val:": bool = True"},{name:"cross_attention_kwargs",val:": typing.Optional[typing.Dict[str, typing.Any]] = None"},{name:"clip_skip",val:": typing.Optional[int] = 1"}],parametersDescription:[{anchor:"diffusers.I2VGenXLPipeline.__call__.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) —
	The prompt or prompts to guide image generation. If not defined, you need to pass <code>prompt_embeds</code>.`,name:"prompt"},{anchor:"diffusers.I2VGenXLPipeline.__call__.image",description:`<strong>image</strong> (<code>PIL.Image.Image</code> or <code>List[PIL.Image.Image]</code> or <code>torch.Tensor</code>) —
	Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
	<a href="https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json" rel="nofollow"><code>CLIPImageProcessor</code></a>.`,name:"image"},{anchor:"diffusers.I2VGenXLPipeline.__call__.height",description:`<strong>height</strong> (<code>int</code>, <em>optional</em>, defaults to <code>self.unet.config.sample_size * self.vae_scale_factor</code>) —
	The height in pixels of the generated image.`,name:"height"},{anchor:"diffusers.I2VGenXLPipeline.__call__.width",description:`<strong>width</strong> (<code>int</code>, <em>optional</em>, defaults to <code>self.unet.config.sample_size * self.vae_scale_factor</code>) —
	The width in pixels of the generated image.`,name:"width"},{anchor:"diffusers.I2VGenXLPipeline.__call__.target_fps",description:`<strong>target_fps</strong> (<code>int</code>, <em>optional</em>) —
	Frames per second. The rate at which the generated images shall be exported to a video after
	generation. This is also used as a “micro-condition” while generation.`,name:"target_fps"},{anchor:"diffusers.I2VGenXLPipeline.__call__.num_frames",description:`<strong>num_frames</strong> (<code>int</code>, <em>optional</em>) —
	The number of video frames to generate.`,name:"num_frames"},{anchor:"diffusers.I2VGenXLPipeline.__call__.num_inference_steps",description:`<strong>num_inference_steps</strong> (<code>int</code>, <em>optional</em>) —
	The number of denoising steps.`,name:"num_inference_steps"},{anchor:"diffusers.I2VGenXLPipeline.__call__.guidance_scale",description:`<strong>guidance_scale</strong> (<code>float</code>, <em>optional</em>, defaults to 7.5) —
	A higher guidance scale value encourages the model to generate images closely linked to the text
	<code>prompt</code> at the expense of lower image quality. Guidance scale is enabled when <code>guidance_scale > 1</code>.`,name:"guidance_scale"},{anchor:"diffusers.I2VGenXLPipeline.__call__.negative_prompt",description:`<strong>negative_prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) —
	The prompt or prompts to guide what to not include in image generation. If not defined, you need to
	pass <code>negative_prompt_embeds</code> instead. Ignored when not using guidance (<code>guidance_scale < 1</code>).`,name:"negative_prompt"},{anchor:"diffusers.I2VGenXLPipeline.__call__.eta",description:`<strong>eta</strong> (<code>float</code>, <em>optional</em>) —
	Corresponds to parameter eta (η) from the <a href="https://arxiv.org/abs/2010.02502" rel="nofollow">DDIM</a> paper. Only applies
	to the <a href="/docs/diffusers/pr_11335/en/api/schedulers/ddim#diffusers.DDIMScheduler">DDIMScheduler</a>, and is ignored in other schedulers.`,name:"eta"},{anchor:"diffusers.I2VGenXLPipeline.__call__.num_videos_per_prompt",description:`<strong>num_videos_per_prompt</strong> (<code>int</code>, <em>optional</em>) —
	The number of images to generate per prompt.`,name:"num_videos_per_prompt"},{anchor:"diffusers.I2VGenXLPipeline.__call__.decode_chunk_size",description:`<strong>decode_chunk_size</strong> (<code>int</code>, <em>optional</em>) —
	The number of frames to decode at a time. The higher the chunk size, the higher the temporal
	consistency between frames, but also the higher the memory consumption. By default, the decoder will
	decode all frames at once for maximal quality. Reduce <code>decode_chunk_size</code> to reduce memory usage.`,name:"decode_chunk_size"},{anchor:"diffusers.I2VGenXLPipeline.__call__.generator",description:`<strong>generator</strong> (<code>torch.Generator</code> or <code>List[torch.Generator]</code>, <em>optional</em>) —
	A <a href="https://pytorch.org/docs/stable/generated/torch.Generator.html" rel="nofollow"><code>torch.Generator</code></a> to make
	generation deterministic.`,name:"generator"},{anchor:"diffusers.I2VGenXLPipeline.__call__.latents",description:`<strong>latents</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
	generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
	tensor is generated by sampling using the supplied random <code>generator</code>.`,name:"latents"},{anchor:"diffusers.I2VGenXLPipeline.__call__.prompt_embeds",description:`<strong>prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
	provided, text embeddings are generated from the <code>prompt</code> input argument.`,name:"prompt_embeds"},{anchor:"diffusers.I2VGenXLPipeline.__call__.negative_prompt_embeds",description:`<strong>negative_prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
	not provided, <code>negative_prompt_embeds</code> are generated from the <code>negative_prompt</code> input argument.`,name:"negative_prompt_embeds"},{anchor:"diffusers.I2VGenXLPipeline.__call__.output_type",description:`<strong>output_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"pil"</code>) —
	The output format of the generated image. Choose between <code>PIL.Image</code> or <code>np.array</code>.`,name:"output_type"},{anchor:"diffusers.I2VGenXLPipeline.__call__.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) —
	Whether or not to return a <a href="/docs/diffusers/pr_11335/en/api/pipelines/stable_diffusion/img2img#diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput">StableDiffusionPipelineOutput</a> instead of a
	plain tuple.`,name:"return_dict"},{anchor:"diffusers.I2VGenXLPipeline.__call__.cross_attention_kwargs",description:`<strong>cross_attention_kwargs</strong> (<code>dict</code>, <em>optional</em>) —
	A kwargs dictionary that if specified is passed along to the <code>AttentionProcessor</code> as defined in
	<a href="https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py" rel="nofollow"><code>self.processor</code></a>.`,name:"cross_attention_kwargs"},{anchor:"diffusers.I2VGenXLPipeline.__call__.clip_skip",description:`<strong>clip_skip</strong> (<code>int</code>, <em>optional</em>) —
	Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
	the output of the pre-final layer will be used for computing the prompt embeddings.`,name:"clip_skip"}],source:"https://github.com/huggingface/diffusers/blob/vr_11335/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py#L508",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>


	<p>If <code>return_dict</code> is <code>True</code>, <a
	href="/docs/diffusers/pr_11335/en/api/pipelines/i2vgenxl#diffusers.pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput"
	>pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput</a> is
	returned, otherwise a <code>tuple</code> is returned where the first element is a list with the generated frames.</p>
	`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>


	<p><a
	href="/docs/diffusers/pr_11335/en/api/pipelines/i2vgenxl#diffusers.pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput"
	>pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput</a> or <code>tuple</code></p>
	`}}),
	V=new Ae({props:{anchor:"diffusers.I2VGenXLPipeline.__call__.example",$$slots:{default:[Ke]},$$scope:{ctx:H}}}),
	z=new ve({props:{name:"encode_prompt",anchor:"diffusers.I2VGenXLPipeline.encode_prompt",parameters:[{name:"prompt",val:""},{name:"device",val:""},{name:"num_videos_per_prompt",val:""},{name:"negative_prompt",val:" = None"},{name:"prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"clip_skip",val:": typing.Optional[int] = None"}],parametersDescription:[{anchor:"diffusers.I2VGenXLPipeline.encode_prompt.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) —
	prompt to be encoded`,name:"prompt"},{anchor:"diffusers.I2VGenXLPipeline.encode_prompt.device",description:`<strong>device</strong> — (<code>torch.device</code>):
	torch device`,name:"device"},{anchor:"diffusers.I2VGenXLPipeline.encode_prompt.num_videos_per_prompt",description:`<strong>num_videos_per_prompt</strong> (<code>int</code>) —
	number of images that should be generated per prompt`,name:"num_videos_per_prompt"},{anchor:"diffusers.I2VGenXLPipeline.encode_prompt.do_classifier_free_guidance",description:`<strong>do_classifier_free_guidance</strong> (<code>bool</code>) —
	whether to use classifier free guidance or not`,name:"do_classifier_free_guidance"},{anchor:"diffusers.I2VGenXLPipeline.encode_prompt.negative_prompt",description:`<strong>negative_prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) —
	The prompt or prompts not to guide the image generation. If not defined, one has to pass
	<code>negative_prompt_embeds</code> instead. Ignored when not using guidance (i.e., ignored if <code>guidance_scale</code> is
	less than <code>1</code>).`,name:"negative_prompt"},{anchor:"diffusers.I2VGenXLPipeline.encode_prompt.prompt_embeds",description:`<strong>prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	Pre-generated text embeddings. Can be used to easily tweak text inputs, <em>e.g.</em> prompt weighting. If not
	provided, text embeddings will be generated from <code>prompt</code> input argument.`,name:"prompt_embeds"},{anchor:"diffusers.I2VGenXLPipeline.encode_prompt.negative_prompt_embeds",description:`<strong>negative_prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) —
	Pre-generated negative text embeddings. Can be used to easily tweak text inputs, <em>e.g.</em> prompt
	weighting. If not provided, negative_prompt_embeds will be generated from <code>negative_prompt</code> input
	argument.`,name:"negative_prompt_embeds"},{anchor:"diffusers.I2VGenXLPipeline.encode_prompt.clip_skip",description:`<strong>clip_skip</strong> (<code>int</code>, <em>optional</em>) —
	Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
	the output of the pre-final layer will be used for computing the prompt embeddings.`,name:"clip_skip"}],source:"https://github.com/huggingface/diffusers/blob/vr_11335/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py#L160"}}),
	N=new be({props:{title:"I2VGenXLPipelineOutput",local:"diffusers.pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput",headingTag:"h2"}}),
	S=new ve({props:{name:"class diffusers.pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput",anchor:"diffusers.pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput",parameters:[{name:"frames",val:": typing.Union[torch.Tensor, numpy.ndarray, typing.List[typing.List[PIL.Image.Image]]]"}],parametersDescription:[{anchor:"diffusers.pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput.frames",description:`<strong>frames</strong> (<code>torch.Tensor</code>, <code>np.ndarray</code>, or List[List[PIL.Image.Image]]) —
	List of video outputs - It can be a nested list of length <code>batch_size,</code> with each sub-list containing
	denoised`,name:"frames"}],source:"https://github.com/huggingface/diffusers/blob/vr_11335/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py#L83"}}),
	E=new Qe({props:{source:"https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/i2vgenxl.md"}}),
	// Lifecycle object of the page fragment.
	{c(){a=r("meta"),L=s(),c=r("p"),f=s(),_(h.$$.fragment),p=s(),x=r("p"),x.innerHTML=Ve,ne=s(),P=r("p"),P.textContent=Xe,ie=s(),$=r("p"),$.innerHTML=Pe,se=s(),k=r("p"),k.innerHTML=$e,oe=s(),_(J.$$.fragment),ae=s(),C=r("p"),C.textContent=ke,re=s(),j=r("table"),j.innerHTML=Ce,le=s(),_(Z.$$.fragment),pe=s(),U=r("ul"),U.innerHTML=je,de=s(),_(B.$$.fragment),ce=s(),m=r("div"),_(D.$$.fragment),ye=s(),R=r("p"),R.innerHTML=Ze,Ie=s(),O=r("p"),O.innerHTML=Ue,we=s(),G=r("div"),_(W.$$.fragment),xe=s(),F=r("p"),F.innerHTML=Be,Te=s(),_(V.$$.fragment),Le=s(),X=r("div"),_(z.$$.fragment),Ge=s(),q=r("p"),q.textContent=De,me=s(),_(N.$$.fragment),ge=s(),T=r("div"),_(S.$$.fragment),Me=s(),A=r("p"),A.textContent=We,Je=s(),Q=r("p"),Q.innerHTML=ze,fe=s(),_(E.$$.fragment),ue=s(),te=r("p"),this.h()},
	l(e){const t=Oe("svelte-u9bgzb",document.head);a=l(t,"META",{name:!0,content:!0}),t.forEach(n),L=o(e),c=l(e,"P",{}),K(c).forEach(n),f=o(e),v(h.$$.fragment,e),p=o(e),x=l(e,"P",{"data-svelte-h":!0}),g(x)!=="svelte-1wrjkyv"&&(x.innerHTML=Ve),ne=o(e),P=l(e,"P",{"data-svelte-h":!0}),g(P)!=="svelte-1cwsb16"&&(P.textContent=Xe),ie=o(e),$=l(e,"P",{"data-svelte-h":!0}),g($)!=="svelte-uj6rwn"&&($.innerHTML=Pe),se=o(e),k=l(e,"P",{"data-svelte-h":!0}),g(k)!=="svelte-1okzf4k"&&(k.innerHTML=$e),oe=o(e),v(J.$$.fragment,e),ae=o(e),C=l(e,"P",{"data-svelte-h":!0}),g(C)!=="svelte-e24chm"&&(C.textContent=ke),re=o(e),j=l(e,"TABLE",{"data-svelte-h":!0}),g(j)!=="svelte-1d0ljr7"&&(j.innerHTML=Ce),le=o(e),v(Z.$$.fragment,e),pe=o(e),U=l(e,"UL",{"data-svelte-h":!0}),g(U)!=="svelte-fxj4q"&&(U.innerHTML=je),de=o(e),v(B.$$.fragment,e),ce=o(e),m=l(e,"DIV",{class:!0});var u=K(m);v(D.$$.fragment,u),ye=o(u),R=l(u,"P",{"data-svelte-h":!0}),g(R)!=="svelte-va7c9e"&&(R.innerHTML=Ze),Ie=o(u),O=l(u,"P",{"data-svelte-h":!0}),g(O)!=="svelte-2o0kj9"&&(O.innerHTML=Ue),we=o(u),G=l(u,"DIV",{class:!0});var M=K(G);v(W.$$.fragment,M),xe=o(M),F=l(M,"P",{"data-svelte-h":!0}),g(F)!=="svelte-1r8lwqq"&&(F.innerHTML=Be),Te=o(M),v(V.$$.fragment,M),M.forEach(n),Le=o(u),X=l(u,"DIV",{class:!0});var _e=K(X);v(z.$$.fragment,_e),Ge=o(_e),q=l(_e,"P",{"data-svelte-h":!0}),g(q)!=="svelte-16q0ax1"&&(q.textContent=De),_e.forEach(n),u.forEach(n),me=o(e),v(N.$$.fragment,e),ge=o(e),T=l(e,"DIV",{class:!0});var Y=K(T);v(S.$$.fragment,Y),Me=o(Y),A=l(Y,"P",{"data-svelte-h":!0}),g(A)!=="svelte-ulnuw9"&&(A.textContent=We),Je=o(Y),Q=l(Y,"P",{"data-svelte-h":!0}),g(Q)!=="svelte-gk6g69"&&(Q.innerHTML=ze),Y.forEach(n),fe=o(e),v(E.$$.fragment,e),ue=o(e),te=l(e,"P",{}),K(te).forEach(n),this.h()},
	h(){ee(a,"name","hf:doc:metadata"),ee(a,"content",tt),ee(G,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),ee(X,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),ee(m,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),ee(T,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8")},
	m(e,t){d(document.head,a),i(e,L,t),i(e,c,t),i(e,f,t),b(h,e,t),i(e,p,t),i(e,x,t),i(e,ne,t),i(e,P,t),i(e,ie,t),i(e,$,t),i(e,se,t),i(e,k,t),i(e,oe,t),b(J,e,t),i(e,ae,t),i(e,C,t),i(e,re,t),i(e,j,t),i(e,le,t),b(Z,e,t),i(e,pe,t),i(e,U,t),i(e,de,t),b(B,e,t),i(e,ce,t),i(e,m,t),b(D,m,null),d(m,ye),d(m,R),d(m,Ie),d(m,O),d(m,we),d(m,G),b(W,G,null),d(G,xe),d(G,F),d(G,Te),b(V,G,null),d(m,Le),d(m,X),b(z,X,null),d(X,Ge),d(X,q),i(e,me,t),b(N,e,t),i(e,ge,t),i(e,T,t),b(S,T,null),d(T,Me),d(T,A),d(T,Je),d(T,Q),i(e,fe,t),b(E,e,t),i(e,ue,t),i(e,te,t),he=!0},
	p(e,[t]){const u={};t&2&&(u.$$scope={dirty:t,ctx:e}),J.$set(u);const M={};t&2&&(M.$$scope={dirty:t,ctx:e}),V.$set(M)},
	i(e){he||(y(h.$$.fragment,e),y(J.$$.fragment,e),y(Z.$$.fragment,e),y(B.$$.fragment,e),y(D.$$.fragment,e),y(W.$$.fragment,e),y(V.$$.fragment,e),y(z.$$.fragment,e),y(N.$$.fragment,e),y(S.$$.fragment,e),y(E.$$.fragment,e),he=!0)},
	o(e){I(h.$$.fragment,e),I(J.$$.fragment,e),I(Z.$$.fragment,e),I(B.$$.fragment,e),I(D.$$.fragment,e),I(W.$$.fragment,e),I(V.$$.fragment,e),I(z.$$.fragment,e),I(N.$$.fragment,e),I(S.$$.fragment,e),I(E.$$.fragment,e),he=!1},
	d(e){e&&(n(L),n(c),n(f),n(p),n(x),n(ne),n(P),n(ie),n($),n(se),n(k),n(oe),n(ae),n(C),n(re),n(j),n(le),n(pe),n(U),n(de),n(ce),n(m),n(me),n(ge),n(T),n(fe),n(ue),n(te)),n(a),w(h,e),w(J,e),w(Z,e),w(B,e),w(D),w(W),w(V),w(z),w(N,e),w(S),w(E,e)}}}

	// tt: JSON-encoded table of contents for this page (title, anchors, section depths).
	// `et` writes it verbatim into the `content` attribute of the "hf:doc:metadata" meta element.
	const tt='{"title":"I2VGen-XL","local":"i2vgen-xl","sections":[{"title":"Notes","local":"notes","sections":[],"depth":2},{"title":"I2VGenXLPipeline","local":"diffusers.I2VGenXLPipeline","sections":[],"depth":2},{"title":"I2VGenXLPipelineOutput","local":"diffusers.pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput","sections":[],"depth":2}],"depth":1}';

	// nt: instance function passed to `Re`. It hands `Ee` a callback that reads the "fw" query
	// parameter from the current URL and discards the value, then returns an empty array.
	// NOTE(review): `Ee` is the scheduler chunk's `o` export — presumably onMount; confirm.
	function nt(H){return Ee(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}

	// dt: the page component. Its constructor delegates to `Re` with the instance function `nt`,
	// the fragment factory `et`, the helper `Se` and an empty object as the last argument.
	class dt extends He{constructor(a){super(),Re(this,a,nt,et,Se,{})}}
	export{dt as component};

Xet Storage Details

Size: 31.1 kB
Xet hash: 6d8b4fb97e659b6d78b72c37dee2aacd8a386fcaffc556f3395cdffa89c7d6db

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.