Buckets:

rtrm's picture
download
raw
36.4 kB
import{s as wt,o as yt,n as vt}from"../chunks/scheduler.8c3d61f6.js";import{S as Lt,i as Mt,g as s,s as o,r as h,A as It,h as r,f as n,c as a,j as pe,u as _,x as p,k as me,y as f,a as i,v as b,d as T,t as x,w as v}from"../chunks/index.da70eac4.js";import{T as $t}from"../chunks/Tip.1d9b8c37.js";import{D as Se}from"../chunks/Docstring.6b390b9a.js";import{C as Ne}from"../chunks/CodeBlock.00a903b3.js";import{E as Pt}from"../chunks/ExampleCodeBlock.db12be95.js";import{H as Oe,E as kt}from"../chunks/EditOnGithub.1e64e623.js";function Jt(ie){let l,L='Make sure to check out the Schedulers <a href="../../using-diffusers/schedulers.md">guide</a> to learn how to explore the tradeoff between scheduler speed and quality, and see the <a href="../../using-diffusers/loading.md#reuse-a-pipeline">reuse components across pipelines</a> section to learn how to efficiently load the same components into multiple pipelines.';return{c(){l=s("p"),l.innerHTML=L},l(d){l=r(d,"P",{"data-svelte-h":!0}),p(l)!=="svelte-w7r39y"&&(l.innerHTML=L)},m(d,u){i(d,l,u)},p:vt,d(d){d&&n(l)}}}function jt(ie){let l,L="Examples:",d,u,w;return u=new Ne({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwTHVtaW5hVGV4dDJJbWdQaXBlbGluZSUwQSUwQXBpcGUlMjAlM0QlMjBMdW1pbmFUZXh0MkltZ1BpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJBbHBoYS1WTExNJTJGTHVtaW5hLU5leHQtU0ZULWRpZmZ1c2VycyUyMiUyQyUyMHRvcmNoX2R0eXBlJTNEdG9yY2guYmZsb2F0MTYlMEEpJTBBJTIzJTIwRW5hYmxlJTIwbWVtb3J5JTIwb3B0aW1pemF0aW9ucy4lMEFwaXBlLmVuYWJsZV9tb2RlbF9jcHVfb2ZmbG9hZCgpJTBBJTBBcHJvbXB0JTIwJTNEJTIwJTIyVXBwZXIlMjBib2R5JTIwb2YlMjBhJTIweW91bmclMjB3b21hbiUyMGluJTIwYSUyMFZpY3Rvcmlhbi1lcmElMjBvdXRmaXQlMjB3aXRoJTIwYnJhc3MlMjBnb2dnbGVzJTIwYW5kJTIwbGVhdGhlciUyMHN0cmFwcy4lMjBCYWNrZ3JvdW5kJTIwc2hvd3MlMjBhbiUyMGluZHVzdHJpYWwlMjByZXZvbHV0aW9uJTIwY2l0eXNjYXBlJTIwd2l0aCUyMHNtb2t5JTIwc2tpZXMlMjBhbmQlMjB0YWxsJTJDJTIwbWV0YWwlMjBzdHJ1Y3R1cmVzJTIyJTBBaW1hZ2UlMjAlM0QlMjBwaXBlKHByb21wdCkuaW1hZ2VzJTVCMCU1RA==",highlighted:`<span 
class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> LuminaText2ImgPipeline
<span class="hljs-meta">&gt;&gt;&gt; </span>pipe = LuminaText2ImgPipeline.from_pretrained(
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;Alpha-VLLM/Lumina-Next-SFT-diffusers&quot;</span>, torch_dtype=torch.bfloat16
<span class="hljs-meta">... </span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># Enable memory optimizations.</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>pipe.enable_model_cpu_offload()
<span class="hljs-meta">&gt;&gt;&gt; </span>prompt = <span class="hljs-string">&quot;Upper body of a young woman in a Victorian-era outfit with brass goggles and leather straps. Background shows an industrial revolution cityscape with smoky skies and tall, metal structures&quot;</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>image = pipe(prompt).images[<span class="hljs-number">0</span>]`,wrap:!1}}),{c(){l=s("p"),l.textContent=L,d=o(),h(u.$$.fragment)},l(m){l=r(m,"P",{"data-svelte-h":!0}),p(l)!=="svelte-kvfsh7"&&(l.textContent=L),d=a(m),_(u.$$.fragment,m)},m(m,y){i(m,l,y),i(m,d,y),b(u,m,y),w=!0},p:vt,i(m){w||(T(u.$$.fragment,m),w=!0)},o(m){x(u.$$.fragment,m),w=!1},d(m){m&&(n(l),n(d)),v(u,m)}}}function Ct(ie){let l,L,d,u,w,m,y,Qe='<img src="https://github.com/Alpha-VLLM/Lumina-T2X/assets/54879512/9f52eabb-07dc-4881-8257-6d8a5f2a0a5a" alt="concepts"/>',de,J,Ke='<a href="https://github.com/Alpha-VLLM/Lumina-T2X/blob/main/assets/lumina-next.pdf" rel="nofollow">Lumina-Next : Making Lumina-T2X Stronger and Faster with Next-DiT</a> from Alpha-VLLM, OpenGVLab, Shanghai AI Laboratory.',ce,j,et="The abstract from the paper is:",ue,C,tt='<em>Lumina-T2X is a nascent family of Flow-based Large Diffusion Transformers (Flag-DiT) that establishes a unified framework for transforming noise into various modalities, such as images and videos, conditioned on text instructions. Despite its promising capabilities, Lumina-T2X still encounters challenges including training instability, slow inference, and extrapolation artifacts. In this paper, we present Lumina-Next, an improved version of Lumina-T2X, showcasing stronger generation performance with increased training and inference efficiency. We begin with a comprehensive analysis of the Flag-DiT architecture and identify several suboptimal components, which we address by introducing the Next-DiT architecture with 3D RoPE and sandwich normalizations. To enable better resolution extrapolation, we thoroughly compare different context extrapolation methods applied to text-to-image generation with 3D RoPE, and propose Frequency- and Time-Aware Scaled RoPE tailored for diffusion transformers. 
Additionally, we introduce a sigmoid time discretization schedule to reduce sampling steps in solving the Flow ODE and the Context Drop method to merge redundant visual tokens for faster network evaluation, effectively boosting the overall sampling speed. Thanks to these improvements, Lumina-Next not only improves the quality and efficiency of basic text-to-image generation but also demonstrates superior resolution extrapolation capabilities and multilingual generation using decoder-based LLMs as the text encoder, all in a zero-shot manner. To further validate Lumina-Next as a versatile generative framework, we instantiate it on diverse tasks including visual recognition, multi-view, audio, music, and point cloud generation, showcasing strong performance across these domains. By releasing all codes and model weights at <a href="https://github.com/Alpha-VLLM/Lumina-T2X" rel="nofollow">https://github.com/Alpha-VLLM/Lumina-T2X</a>, we aim to advance the development of next-generation generative AI capable of universal modeling.</em>',fe,B,nt="<strong>Highlights</strong>: Lumina-Next is a next-generation Diffusion Transformer that significantly enhances text-to-image generation, multilingual generation, and multitask performance by introducing the Next-DiT architecture, 3D RoPE, and frequency- and time-aware RoPE, among other improvements.",ge,V,it="Lumina-Next has the following components:",he,W,ot="<li>It improves sampling efficiency with fewer and faster Steps.</li> <li>It uses a Next-DiT as a transformer backbone with Sandwichnorm 3D RoPE, and Grouped-Query Attention.</li> <li>It uses a Frequency- and Time-Aware Scaled RoPE.</li>",_e,be,Te,H,at='<a href="https://arxiv.org/abs/2405.05945" rel="nofollow">Lumina-T2X: Transforming Text into Any Modality, Resolution, and Duration via Flow-based Large Diffusion Transformers</a> from Alpha-VLLM, OpenGVLab, Shanghai AI Laboratory.',xe,U,st="The abstract from the paper is:",ve,X,rt="<em>Sora unveils the potential of scaling 
Diffusion Transformer for generating photorealistic images and videos at arbitrary resolutions, aspect ratios, and durations, yet it still lacks sufficient implementation details. In this technical report, we introduce the Lumina-T2X family - a series of Flow-based Large Diffusion Transformers (Flag-DiT) equipped with zero-initialized attention, as a unified framework designed to transform noise into images, videos, multi-view 3D objects, and audio clips conditioned on text instructions. By tokenizing the latent spatial-temporal space and incorporating learnable placeholders such as [nextline] and [nextframe] tokens, Lumina-T2X seamlessly unifies the representations of different modalities across various spatial-temporal resolutions. This unified approach enables training within a single framework for different modalities and allows for flexible generation of multimodal data at any resolution, aspect ratio, and length during inference. Advanced techniques like RoPE, RMSNorm, and flow matching enhance the stability, flexibility, and scalability of Flag-DiT, enabling models of Lumina-T2X to scale up to 7 billion parameters and extend the context window to 128K tokens. This is particularly beneficial for creating ultra-high-definition images with our Lumina-T2I model and long 720p videos with our Lumina-T2V model. Remarkably, Lumina-T2I, powered by a 5-billion-parameter Flag-DiT, requires only 35% of the training computational costs of a 600-million-parameter naive DiT. Our further comprehensive analysis underscores Lumina-T2X’s preliminary capability in resolution extrapolation, high-resolution editing, generating consistent 3D views, and synthesizing videos with seamless transitions. 
We expect that the open-sourcing of Lumina-T2X will further foster creativity, transparency, and diversity in the generative AI community.</em>",we,Z,lt='You can find the original codebase at <a href="https://github.com/Alpha-VLLM/Lumina-T2X" rel="nofollow">Alpha-VLLM</a> and all the available checkpoints at <a href="https://huggingface.co/collections/Alpha-VLLM/lumina-family-66423205bedb81171fd0644b" rel="nofollow">Alpha-VLLM Lumina Family</a>.',ye,D,pt="<strong>Highlights</strong>: Lumina-T2X supports Any Modality, Resolution, and Duration.",Le,G,mt="Lumina-T2X has the following components:",Me,N,dt="<li>It uses a Flow-based Large Diffusion Transformer as the backbone</li> <li>It supports different any modalities with one backbone and corresponding encoder, decoder.</li>",Ie,A,ct='This pipeline was contributed by <a href="https://github.com/PommesPeter" rel="nofollow">PommesPeter</a>. The original codebase can be found <a href="https://github.com/Alpha-VLLM/Lumina-T2X" rel="nofollow">here</a>. The original weights can be found under <a href="https://huggingface.co/Alpha-VLLM" rel="nofollow">hf.co/Alpha-VLLM</a>.',$e,$,Pe,R,ke,q,ut='Use <a href="https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile" rel="nofollow"><code>torch.compile</code></a> to reduce the inference latency.',Je,F,ft="First, load the pipeline:",je,E,Ce,Y,gt="Then change the memory layout of the pipelines <code>transformer</code> and <code>vae</code> components to <code>torch.channels-last</code>:",Be,z,Ve,S,ht="Finally, compile the components and run inference:",We,O,He,Q,Ue,c,K,Ae,oe,_t="Pipeline for text-to-image generation using Lumina-T2I.",Re,ae,bt=`This model inherits from <a href="/docs/diffusers/pr_10312/en/api/pipelines/overview#diffusers.DiffusionPipeline">DiffusionPipeline</a>. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)`,qe,M,ee,Fe,se,Tt="Function invoked when calling the pipeline for generation.",Ee,P,Ye,k,te,ze,re,xt="Encodes the prompt into text encoder hidden states.",Xe,ne,Ze,le,De;return w=new Oe({props:{title:"Lumina-T2X",local:"lumina-t2x",headingTag:"h1"}}),$=new $t({props:{$$slots:{default:[Jt]},$$scope:{ctx:ie}}}),R=new Oe({props:{title:"Inference (Text-to-Image)",local:"inference-text-to-image",headingTag:"h3"}}),E=new Ne({props:{code:"ZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMEx1bWluYVRleHQySW1nUGlwZWxpbmUlMEFpbXBvcnQlMjB0b3JjaCUwQSUwQXBpcGVsaW5lJTIwJTNEJTIwTHVtaW5hVGV4dDJJbWdQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTA5JTIyQWxwaGEtVkxMTSUyRkx1bWluYS1OZXh0LVNGVC1kaWZmdXNlcnMlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmJmbG9hdDE2JTBBKS50byglMjJjdWRhJTIyKQ==",highlighted:`<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> LuminaText2ImgPipeline
<span class="hljs-keyword">import</span> torch
pipeline = LuminaText2ImgPipeline.from_pretrained(
<span class="hljs-string">&quot;Alpha-VLLM/Lumina-Next-SFT-diffusers&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)`,wrap:!1}}),z=new Ne({props:{code:"cGlwZWxpbmUudHJhbnNmb3JtZXIudG8obWVtb3J5X2Zvcm1hdCUzRHRvcmNoLmNoYW5uZWxzX2xhc3QpJTBBcGlwZWxpbmUudmFlLnRvKG1lbW9yeV9mb3JtYXQlM0R0b3JjaC5jaGFubmVsc19sYXN0KQ==",highlighted:`pipeline.transformer.to(memory_format=torch.channels_last)
pipeline.vae.to(memory_format=torch.channels_last)`,wrap:!1}}),O=new Ne({props:{code:"cGlwZWxpbmUudHJhbnNmb3JtZXIlMjAlM0QlMjB0b3JjaC5jb21waWxlKHBpcGVsaW5lLnRyYW5zZm9ybWVyJTJDJTIwbW9kZSUzRCUyMm1heC1hdXRvdHVuZSUyMiUyQyUyMGZ1bGxncmFwaCUzRFRydWUpJTBBcGlwZWxpbmUudmFlLmRlY29kZSUyMCUzRCUyMHRvcmNoLmNvbXBpbGUocGlwZWxpbmUudmFlLmRlY29kZSUyQyUyMG1vZGUlM0QlMjJtYXgtYXV0b3R1bmUlMjIlMkMlMjBmdWxsZ3JhcGglM0RUcnVlKSUwQSUwQWltYWdlJTIwJTNEJTIwcGlwZWxpbmUocHJvbXB0JTNEJTIyVXBwZXIlMjBib2R5JTIwb2YlMjBhJTIweW91bmclMjB3b21hbiUyMGluJTIwYSUyMFZpY3Rvcmlhbi1lcmElMjBvdXRmaXQlMjB3aXRoJTIwYnJhc3MlMjBnb2dnbGVzJTIwYW5kJTIwbGVhdGhlciUyMHN0cmFwcy4lMjBCYWNrZ3JvdW5kJTIwc2hvd3MlMjBhbiUyMGluZHVzdHJpYWwlMjByZXZvbHV0aW9uJTIwY2l0eXNjYXBlJTIwd2l0aCUyMHNtb2t5JTIwc2tpZXMlMjBhbmQlMjB0YWxsJTJDJTIwbWV0YWwlMjBzdHJ1Y3R1cmVzJTIyKS5pbWFnZXMlNUIwJTVE",highlighted:`pipeline.transformer = torch.<span class="hljs-built_in">compile</span>(pipeline.transformer, mode=<span class="hljs-string">&quot;max-autotune&quot;</span>, fullgraph=<span class="hljs-literal">True</span>)
pipeline.vae.decode = torch.<span class="hljs-built_in">compile</span>(pipeline.vae.decode, mode=<span class="hljs-string">&quot;max-autotune&quot;</span>, fullgraph=<span class="hljs-literal">True</span>)
image = pipeline(prompt=<span class="hljs-string">&quot;Upper body of a young woman in a Victorian-era outfit with brass goggles and leather straps. Background shows an industrial revolution cityscape with smoky skies and tall, metal structures&quot;</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),Q=new Oe({props:{title:"LuminaText2ImgPipeline",local:"diffusers.LuminaText2ImgPipeline",headingTag:"h2"}}),K=new Se({props:{name:"class diffusers.LuminaText2ImgPipeline",anchor:"diffusers.LuminaText2ImgPipeline",parameters:[{name:"transformer",val:": LuminaNextDiT2DModel"},{name:"scheduler",val:": FlowMatchEulerDiscreteScheduler"},{name:"vae",val:": AutoencoderKL"},{name:"text_encoder",val:": AutoModel"},{name:"tokenizer",val:": AutoTokenizer"}],parametersDescription:[{anchor:"diffusers.LuminaText2ImgPipeline.vae",description:`<strong>vae</strong> (<a href="/docs/diffusers/pr_10312/en/api/models/autoencoderkl#diffusers.AutoencoderKL">AutoencoderKL</a>) &#x2014;
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.`,name:"vae"},{anchor:"diffusers.LuminaText2ImgPipeline.text_encoder",description:`<strong>text_encoder</strong> (<code>AutoModel</code>) &#x2014;
Frozen text-encoder. Lumina-T2I uses
<a href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.AutoModel" rel="nofollow">T5</a>, specifically the
<a href="https://huggingface.co/Alpha-VLLM/tree/main/t5-v1_1-xxl" rel="nofollow">t5-v1_1-xxl</a> variant.`,name:"text_encoder"},{anchor:"diffusers.LuminaText2ImgPipeline.tokenizer",description:`<strong>tokenizer</strong> (<code>AutoModel</code>) &#x2014;
Tokenizer of class
<a href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.AutoModel" rel="nofollow">AutoModel</a>.`,name:"tokenizer"},{anchor:"diffusers.LuminaText2ImgPipeline.transformer",description:`<strong>transformer</strong> (<a href="/docs/diffusers/pr_10312/en/api/models/transformer2d#diffusers.Transformer2DModel">Transformer2DModel</a>) &#x2014;
A text conditioned <code>Transformer2DModel</code> to denoise the encoded image latents.`,name:"transformer"},{anchor:"diffusers.LuminaText2ImgPipeline.scheduler",description:`<strong>scheduler</strong> (<a href="/docs/diffusers/pr_10312/en/api/schedulers/overview#diffusers.SchedulerMixin">SchedulerMixin</a>) &#x2014;
A scheduler to be used in combination with <code>transformer</code> to denoise the encoded image latents.`,name:"scheduler"}],source:"https://github.com/huggingface/diffusers/blob/vr_10312/src/diffusers/pipelines/lumina/pipeline_lumina.py#L127"}}),ee=new Se({props:{name:"__call__",anchor:"diffusers.LuminaText2ImgPipeline.__call__",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]] = None"},{name:"width",val:": typing.Optional[int] = None"},{name:"height",val:": typing.Optional[int] = None"},{name:"num_inference_steps",val:": int = 30"},{name:"guidance_scale",val:": float = 4.0"},{name:"negative_prompt",val:": typing.Union[str, typing.List[str]] = None"},{name:"sigmas",val:": typing.List[float] = None"},{name:"num_images_per_prompt",val:": typing.Optional[int] = 1"},{name:"generator",val:": typing.Union[torch._C.Generator, typing.List[torch._C.Generator], NoneType] = None"},{name:"latents",val:": typing.Optional[torch.Tensor] = None"},{name:"prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"prompt_attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"output_type",val:": typing.Optional[str] = 'pil'"},{name:"return_dict",val:": bool = True"},{name:"clean_caption",val:": bool = True"},{name:"max_sequence_length",val:": int = 256"},{name:"scaling_watershed",val:": typing.Optional[float] = 1.0"},{name:"proportional_attn",val:": typing.Optional[bool] = True"}],parametersDescription:[{anchor:"diffusers.LuminaText2ImgPipeline.__call__.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) &#x2014;
The prompt or prompts to guide the image generation. If not defined, one has to pass <code>prompt_embeds</code>.
instead.`,name:"prompt"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.negative_prompt",description:`<strong>negative_prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) &#x2014;
The prompt or prompts not to guide the image generation. If not defined, one has to pass
<code>negative_prompt_embeds</code> instead. Ignored when not using guidance (i.e., ignored if <code>guidance_scale</code> is
less than <code>1</code>).`,name:"negative_prompt"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.num_inference_steps",description:`<strong>num_inference_steps</strong> (<code>int</code>, <em>optional</em>, defaults to 30) &#x2014;
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.`,name:"num_inference_steps"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.sigmas",description:`<strong>sigmas</strong> (<code>List[float]</code>, <em>optional</em>) &#x2014;
Custom sigmas to use for the denoising process with schedulers which support a <code>sigmas</code> argument in
their <code>set_timesteps</code> method. If not defined, the default behavior when <code>num_inference_steps</code> is passed
will be used.`,name:"sigmas"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.guidance_scale",description:`<strong>guidance_scale</strong> (<code>float</code>, <em>optional</em>, defaults to 4.0) &#x2014;
Guidance scale as defined in <a href="https://arxiv.org/abs/2207.12598" rel="nofollow">Classifier-Free Diffusion Guidance</a>.
<code>guidance_scale</code> is defined as <code>w</code> of equation 2. of <a href="https://arxiv.org/pdf/2205.11487.pdf" rel="nofollow">Imagen
Paper</a>. Guidance scale is enabled by setting <code>guidance_scale &gt; 1</code>. Higher guidance scale encourages to generate images that are closely linked to the text <code>prompt</code>,
usually at the expense of lower image quality.`,name:"guidance_scale"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.num_images_per_prompt",description:`<strong>num_images_per_prompt</strong> (<code>int</code>, <em>optional</em>, defaults to 1) &#x2014;
The number of images to generate per prompt.`,name:"num_images_per_prompt"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.height",description:`<strong>height</strong> (<code>int</code>, <em>optional</em>, defaults to self.unet.config.sample_size) &#x2014;
The height in pixels of the generated image.`,name:"height"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.width",description:`<strong>width</strong> (<code>int</code>, <em>optional</em>, defaults to self.unet.config.sample_size) &#x2014;
The width in pixels of the generated image.`,name:"width"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.eta",description:`<strong>eta</strong> (<code>float</code>, <em>optional</em>, defaults to 0.0) &#x2014;
Corresponds to parameter eta (&#x3B7;) in the DDIM paper: <a href="https://arxiv.org/abs/2010.02502" rel="nofollow">https://arxiv.org/abs/2010.02502</a>. Only applies to
<a href="/docs/diffusers/pr_10312/en/api/schedulers/ddim#diffusers.DDIMScheduler">schedulers.DDIMScheduler</a>, will be ignored for others.`,name:"eta"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.generator",description:`<strong>generator</strong> (<code>torch.Generator</code> or <code>List[torch.Generator]</code>, <em>optional</em>) &#x2014;
One or a list of <a href="https://pytorch.org/docs/stable/generated/torch.Generator.html" rel="nofollow">torch generator(s)</a>
to make generation deterministic.`,name:"generator"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.latents",description:`<strong>latents</strong> (<code>torch.Tensor</code>, <em>optional</em>) &#x2014;
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will ge generated by sampling using the supplied random <code>generator</code>.`,name:"latents"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.prompt_embeds",description:`<strong>prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) &#x2014;
Pre-generated text embeddings. Can be used to easily tweak text inputs, <em>e.g.</em> prompt weighting. If not
provided, text embeddings will be generated from <code>prompt</code> input argument.`,name:"prompt_embeds"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.prompt_attention_mask",description:"<strong>prompt_attention_mask</strong> (<code>torch.Tensor</code>, <em>optional</em>) &#x2014; Pre-generated attention mask for text embeddings.",name:"prompt_attention_mask"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.negative_prompt_embeds",description:`<strong>negative_prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) &#x2014;
Pre-generated negative text embeddings. For Lumina-T2I this negative prompt should be &quot;&quot;. If not
provided, negative_prompt_embeds will be generated from <code>negative_prompt</code> input argument.`,name:"negative_prompt_embeds"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.negative_prompt_attention_mask",description:`<strong>negative_prompt_attention_mask</strong> (<code>torch.Tensor</code>, <em>optional</em>) &#x2014;
Pre-generated attention mask for negative text embeddings.`,name:"negative_prompt_attention_mask"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.output_type",description:`<strong>output_type</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;pil&quot;</code>) &#x2014;
The output format of the generate image. Choose between
<a href="https://pillow.readthedocs.io/en/stable/" rel="nofollow">PIL</a>: <code>PIL.Image.Image</code> or <code>np.array</code>.`,name:"output_type"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
Whether or not to return a <code>~pipelines.stable_diffusion.IFPipelineOutput</code> instead of a plain tuple.`,name:"return_dict"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.clean_caption",description:`<strong>clean_caption</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
Whether or not to clean the caption before creating embeddings. Requires <code>beautifulsoup4</code> and <code>ftfy</code> to
be installed. If the dependencies are not installed, the embeddings will be created from the raw
prompt.`,name:"clean_caption"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.max_sequence_length",description:`<strong>max_sequence_length</strong> (<code>int</code> defaults to 120) &#x2014;
Maximum sequence length to use with the <code>prompt</code>.`,name:"max_sequence_length"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.callback_on_step_end",description:`<strong>callback_on_step_end</strong> (<code>Callable</code>, <em>optional</em>) &#x2014;
A function that calls at the end of each denoising steps during the inference. The function is called
with the following arguments: <code>callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)</code>. <code>callback_kwargs</code> will include a list of all tensors as specified by
<code>callback_on_step_end_tensor_inputs</code>.`,name:"callback_on_step_end"},{anchor:"diffusers.LuminaText2ImgPipeline.__call__.callback_on_step_end_tensor_inputs",description:`<strong>callback_on_step_end_tensor_inputs</strong> (<code>List</code>, <em>optional</em>) &#x2014;
The list of tensor inputs for the <code>callback_on_step_end</code> function. The tensors specified in the list
will be passed as <code>callback_kwargs</code> argument. You will only be able to include variables listed in the
<code>._callback_tensor_inputs</code> attribute of your pipeline class.`,name:"callback_on_step_end_tensor_inputs"}],source:"https://github.com/huggingface/diffusers/blob/vr_10312/src/diffusers/pipelines/lumina/pipeline_lumina.py#L612",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>
<p>If <code>return_dict</code> is <code>True</code>, <a
href="/docs/diffusers/pr_10312/en/api/pipelines/latent_diffusion#diffusers.ImagePipelineOutput"
>ImagePipelineOutput</a> is returned, otherwise a <code>tuple</code> is
returned where the first element is a list with the generated images</p>
`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>
<p><a
href="/docs/diffusers/pr_10312/en/api/pipelines/latent_diffusion#diffusers.ImagePipelineOutput"
>ImagePipelineOutput</a> or <code>tuple</code></p>
`}}),P=new Pt({props:{anchor:"diffusers.LuminaText2ImgPipeline.__call__.example",$$slots:{default:[jt]},$$scope:{ctx:ie}}}),te=new Se({props:{name:"encode_prompt",anchor:"diffusers.LuminaText2ImgPipeline.encode_prompt",parameters:[{name:"prompt",val:": typing.Union[str, typing.List[str]]"},{name:"do_classifier_free_guidance",val:": bool = True"},{name:"negative_prompt",val:": typing.Union[str, typing.List[str]] = None"},{name:"num_images_per_prompt",val:": int = 1"},{name:"device",val:": typing.Optional[torch.device] = None"},{name:"prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_embeds",val:": typing.Optional[torch.Tensor] = None"},{name:"prompt_attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"negative_prompt_attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"clean_caption",val:": bool = False"},{name:"**kwargs",val:""}],parametersDescription:[{anchor:"diffusers.LuminaText2ImgPipeline.encode_prompt.prompt",description:`<strong>prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) &#x2014;
prompt to be encoded`,name:"prompt"},{anchor:"diffusers.LuminaText2ImgPipeline.encode_prompt.negative_prompt",description:`<strong>negative_prompt</strong> (<code>str</code> or <code>List[str]</code>, <em>optional</em>) &#x2014;
The prompt not to guide the image generation. If not defined, one has to pass <code>negative_prompt_embeds</code>
instead. Ignored when not using guidance (i.e., ignored if <code>guidance_scale</code> is less than <code>1</code>). For
Lumina-T2I, this should be &quot;&quot;.`,name:"negative_prompt"},{anchor:"diffusers.LuminaText2ImgPipeline.encode_prompt.do_classifier_free_guidance",description:`<strong>do_classifier_free_guidance</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
whether to use classifier free guidance or not`,name:"do_classifier_free_guidance"},{anchor:"diffusers.LuminaText2ImgPipeline.encode_prompt.num_images_per_prompt",description:`<strong>num_images_per_prompt</strong> (<code>int</code>, <em>optional</em>, defaults to 1) &#x2014;
number of images that should be generated per prompt`,name:"num_images_per_prompt"},{anchor:"diffusers.LuminaText2ImgPipeline.encode_prompt.device",description:`<strong>device</strong> &#x2014; (<code>torch.device</code>, <em>optional</em>):
torch device to place the resulting embeddings on`,name:"device"},{anchor:"diffusers.LuminaText2ImgPipeline.encode_prompt.prompt_embeds",description:`<strong>prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) &#x2014;
Pre-generated text embeddings. Can be used to easily tweak text inputs, <em>e.g.</em> prompt weighting. If not
provided, text embeddings will be generated from <code>prompt</code> input argument.`,name:"prompt_embeds"},{anchor:"diffusers.LuminaText2ImgPipeline.encode_prompt.negative_prompt_embeds",description:`<strong>negative_prompt_embeds</strong> (<code>torch.Tensor</code>, <em>optional</em>) &#x2014;
Pre-generated negative text embeddings. For Lumina-T2I, it&#x2019;s should be the embeddings of the &quot;&quot; string.`,name:"negative_prompt_embeds"},{anchor:"diffusers.LuminaText2ImgPipeline.encode_prompt.clean_caption",description:`<strong>clean_caption</strong> (<code>bool</code>, defaults to <code>False</code>) &#x2014;
If <code>True</code>, the function will preprocess and clean the provided caption before encoding.`,name:"clean_caption"},{anchor:"diffusers.LuminaText2ImgPipeline.encode_prompt.max_sequence_length",description:"<strong>max_sequence_length</strong> (<code>int</code>, defaults to 256) &#x2014; Maximum sequence length to use for the prompt.",name:"max_sequence_length"}],source:"https://github.com/huggingface/diffusers/blob/vr_10312/src/diffusers/pipelines/lumina/pipeline_lumina.py#L252"}}),ne=new kt({props:{source:"https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/lumina.md"}}),{c(){l=s("meta"),L=o(),d=s("p"),u=o(),h(w.$$.fragment),m=o(),y=s("p"),y.innerHTML=Qe,de=o(),J=s("p"),J.innerHTML=Ke,ce=o(),j=s("p"),j.textContent=et,ue=o(),C=s("p"),C.innerHTML=tt,fe=o(),B=s("p"),B.innerHTML=nt,ge=o(),V=s("p"),V.textContent=it,he=o(),W=s("ul"),W.innerHTML=ot,_e=o(),be=s("hr"),Te=o(),H=s("p"),H.innerHTML=at,xe=o(),U=s("p"),U.textContent=st,ve=o(),X=s("p"),X.innerHTML=rt,we=o(),Z=s("p"),Z.innerHTML=lt,ye=o(),D=s("p"),D.innerHTML=pt,Le=o(),G=s("p"),G.textContent=mt,Me=o(),N=s("ul"),N.innerHTML=dt,Ie=o(),A=s("p"),A.innerHTML=ct,$e=o(),h($.$$.fragment),Pe=o(),h(R.$$.fragment),ke=o(),q=s("p"),q.innerHTML=ut,Je=o(),F=s("p"),F.textContent=ft,je=o(),h(E.$$.fragment),Ce=o(),Y=s("p"),Y.innerHTML=gt,Be=o(),h(z.$$.fragment),Ve=o(),S=s("p"),S.textContent=ht,We=o(),h(O.$$.fragment),He=o(),h(Q.$$.fragment),Ue=o(),c=s("div"),h(K.$$.fragment),Ae=o(),oe=s("p"),oe.textContent=_t,Re=o(),ae=s("p"),ae.innerHTML=bt,qe=o(),M=s("div"),h(ee.$$.fragment),Fe=o(),se=s("p"),se.textContent=Tt,Ee=o(),h(P.$$.fragment),Ye=o(),k=s("div"),h(te.$$.fragment),ze=o(),re=s("p"),re.textContent=xt,Xe=o(),h(ne.$$.fragment),Ze=o(),le=s("p"),this.h()},l(e){const 
t=It("svelte-u9bgzb",document.head);l=r(t,"META",{name:!0,content:!0}),t.forEach(n),L=a(e),d=r(e,"P",{}),pe(d).forEach(n),u=a(e),_(w.$$.fragment,e),m=a(e),y=r(e,"P",{"data-svelte-h":!0}),p(y)!=="svelte-z320hd"&&(y.innerHTML=Qe),de=a(e),J=r(e,"P",{"data-svelte-h":!0}),p(J)!=="svelte-15xt6tn"&&(J.innerHTML=Ke),ce=a(e),j=r(e,"P",{"data-svelte-h":!0}),p(j)!=="svelte-1cwsb16"&&(j.textContent=et),ue=a(e),C=r(e,"P",{"data-svelte-h":!0}),p(C)!=="svelte-zv9btz"&&(C.innerHTML=tt),fe=a(e),B=r(e,"P",{"data-svelte-h":!0}),p(B)!=="svelte-1mx7f6l"&&(B.innerHTML=nt),ge=a(e),V=r(e,"P",{"data-svelte-h":!0}),p(V)!=="svelte-1vdt8mw"&&(V.textContent=it),he=a(e),W=r(e,"UL",{"data-svelte-h":!0}),p(W)!=="svelte-iwyd0p"&&(W.innerHTML=ot),_e=a(e),be=r(e,"HR",{}),Te=a(e),H=r(e,"P",{"data-svelte-h":!0}),p(H)!=="svelte-1sewgiu"&&(H.innerHTML=at),xe=a(e),U=r(e,"P",{"data-svelte-h":!0}),p(U)!=="svelte-1cwsb16"&&(U.textContent=st),ve=a(e),X=r(e,"P",{"data-svelte-h":!0}),p(X)!=="svelte-1n6lvl2"&&(X.innerHTML=rt),we=a(e),Z=r(e,"P",{"data-svelte-h":!0}),p(Z)!=="svelte-1d4lw4f"&&(Z.innerHTML=lt),ye=a(e),D=r(e,"P",{"data-svelte-h":!0}),p(D)!=="svelte-xfq9w9"&&(D.innerHTML=pt),Le=a(e),G=r(e,"P",{"data-svelte-h":!0}),p(G)!=="svelte-1imnafj"&&(G.textContent=mt),Me=a(e),N=r(e,"UL",{"data-svelte-h":!0}),p(N)!=="svelte-1cqh1e9"&&(N.innerHTML=dt),Ie=a(e),A=r(e,"P",{"data-svelte-h":!0}),p(A)!=="svelte-14qpphk"&&(A.innerHTML=ct),$e=a(e),_($.$$.fragment,e),Pe=a(e),_(R.$$.fragment,e),ke=a(e),q=r(e,"P",{"data-svelte-h":!0}),p(q)!=="svelte-iekg51"&&(q.innerHTML=ut),Je=a(e),F=r(e,"P",{"data-svelte-h":!0}),p(F)!=="svelte-jub7f1"&&(F.textContent=ft),je=a(e),_(E.$$.fragment,e),Ce=a(e),Y=r(e,"P",{"data-svelte-h":!0}),p(Y)!=="svelte-4294wb"&&(Y.innerHTML=gt),Be=a(e),_(z.$$.fragment,e),Ve=a(e),S=r(e,"P",{"data-svelte-h":!0}),p(S)!=="svelte-9i4prs"&&(S.textContent=ht),We=a(e),_(O.$$.fragment,e),He=a(e),_(Q.$$.fragment,e),Ue=a(e),c=r(e,"DIV",{class:!0});var 
g=pe(c);_(K.$$.fragment,g),Ae=a(g),oe=r(g,"P",{"data-svelte-h":!0}),p(oe)!=="svelte-1ennvvi"&&(oe.textContent=_t),Re=a(g),ae=r(g,"P",{"data-svelte-h":!0}),p(ae)!=="svelte-1xjy0xy"&&(ae.innerHTML=bt),qe=a(g),M=r(g,"DIV",{class:!0});var I=pe(M);_(ee.$$.fragment,I),Fe=a(I),se=r(I,"P",{"data-svelte-h":!0}),p(se)!=="svelte-v78lg8"&&(se.textContent=Tt),Ee=a(I),_(P.$$.fragment,I),I.forEach(n),Ye=a(g),k=r(g,"DIV",{class:!0});var Ge=pe(k);_(te.$$.fragment,Ge),ze=a(Ge),re=r(Ge,"P",{"data-svelte-h":!0}),p(re)!=="svelte-16q0ax1"&&(re.textContent=xt),Ge.forEach(n),g.forEach(n),Xe=a(e),_(ne.$$.fragment,e),Ze=a(e),le=r(e,"P",{}),pe(le).forEach(n),this.h()},h(){me(l,"name","hf:doc:metadata"),me(l,"content",Bt),me(M,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),me(k,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),me(c,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8")},m(e,t){f(document.head,l),i(e,L,t),i(e,d,t),i(e,u,t),b(w,e,t),i(e,m,t),i(e,y,t),i(e,de,t),i(e,J,t),i(e,ce,t),i(e,j,t),i(e,ue,t),i(e,C,t),i(e,fe,t),i(e,B,t),i(e,ge,t),i(e,V,t),i(e,he,t),i(e,W,t),i(e,_e,t),i(e,be,t),i(e,Te,t),i(e,H,t),i(e,xe,t),i(e,U,t),i(e,ve,t),i(e,X,t),i(e,we,t),i(e,Z,t),i(e,ye,t),i(e,D,t),i(e,Le,t),i(e,G,t),i(e,Me,t),i(e,N,t),i(e,Ie,t),i(e,A,t),i(e,$e,t),b($,e,t),i(e,Pe,t),b(R,e,t),i(e,ke,t),i(e,q,t),i(e,Je,t),i(e,F,t),i(e,je,t),b(E,e,t),i(e,Ce,t),i(e,Y,t),i(e,Be,t),b(z,e,t),i(e,Ve,t),i(e,S,t),i(e,We,t),b(O,e,t),i(e,He,t),b(Q,e,t),i(e,Ue,t),i(e,c,t),b(K,c,null),f(c,Ae),f(c,oe),f(c,Re),f(c,ae),f(c,qe),f(c,M),b(ee,M,null),f(M,Fe),f(M,se),f(M,Ee),b(P,M,null),f(c,Ye),f(c,k),b(te,k,null),f(k,ze),f(k,re),i(e,Xe,t),b(ne,e,t),i(e,Ze,t),i(e,le,t),De=!0},p(e,[t]){const g={};t&2&&(g.$$scope={dirty:t,ctx:e}),$.$set(g);const 
I={};t&2&&(I.$$scope={dirty:t,ctx:e}),P.$set(I)},i(e){De||(T(w.$$.fragment,e),T($.$$.fragment,e),T(R.$$.fragment,e),T(E.$$.fragment,e),T(z.$$.fragment,e),T(O.$$.fragment,e),T(Q.$$.fragment,e),T(K.$$.fragment,e),T(ee.$$.fragment,e),T(P.$$.fragment,e),T(te.$$.fragment,e),T(ne.$$.fragment,e),De=!0)},o(e){x(w.$$.fragment,e),x($.$$.fragment,e),x(R.$$.fragment,e),x(E.$$.fragment,e),x(z.$$.fragment,e),x(O.$$.fragment,e),x(Q.$$.fragment,e),x(K.$$.fragment,e),x(ee.$$.fragment,e),x(P.$$.fragment,e),x(te.$$.fragment,e),x(ne.$$.fragment,e),De=!1},d(e){e&&(n(L),n(d),n(u),n(m),n(y),n(de),n(J),n(ce),n(j),n(ue),n(C),n(fe),n(B),n(ge),n(V),n(he),n(W),n(_e),n(be),n(Te),n(H),n(xe),n(U),n(ve),n(X),n(we),n(Z),n(ye),n(D),n(Le),n(G),n(Me),n(N),n(Ie),n(A),n($e),n(Pe),n(ke),n(q),n(Je),n(F),n(je),n(Ce),n(Y),n(Be),n(Ve),n(S),n(We),n(He),n(Ue),n(c),n(Xe),n(Ze),n(le)),n(l),v(w,e),v($,e),v(R,e),v(E,e),v(z,e),v(O,e),v(Q,e),v(K),v(ee),v(P),v(te),v(ne,e)}}}const Bt='{"title":"Lumina-T2X","local":"lumina-t2x","sections":[{"title":"Inference (Text-to-Image)","local":"inference-text-to-image","sections":[],"depth":3},{"title":"LuminaText2ImgPipeline","local":"diffusers.LuminaText2ImgPipeline","sections":[],"depth":2}],"depth":1}';function Vt(ie){return yt(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Nt extends Lt{constructor(l){super(),Mt(this,l,Vt,Ct,wt,{})}}export{Nt as component};

Xet Storage Details

Size:
36.4 kB
·
Xet hash:
4580c51eaa76ee7a97eadb4b841643390f5b70839cb9fc2ce66273d6d78918c6

Xet stores files efficiently by intelligently splitting them into unique, deduplicated chunks, which accelerates both uploads and downloads. More info is available in the Xet storage documentation.