Buckets:
/**
 * Compiled Svelte page chunk for the `ZImageTransformer2DModel` API
 * documentation page. Exports the page component as `component`.
 *
 * The imported helpers keep their minified aliases because the chunk files
 * that define them are not visible here. Judging only from how they are used
 * below, they appear to be the usual Svelte runtime internals (element/space
 * creation, hydration "claim" helpers, attr, insert/append, component
 * create/claim/mount/destroy, transition in/out) -- NOTE(review): confirm
 * against ../chunks/index.*.js before relying on that mapping.
 */
import { s as ce, n as pe, o as ge } from "../chunks/scheduler.53228c21.js";
import {
  S as _e,
  i as ue,
  e as i,
  s as r,
  c as _,
  h as he,
  a as m,
  d as t,
  b as n,
  f as N,
  g as u,
  j as V,
  k as H,
  l as o,
  m as l,
  n as h,
  t as v,
  o as $,
  p as b,
} from "../chunks/index.100fac89.js";
import { C as ve } from "../chunks/CopyLLMTxtMenu.356334a4.js";
import { D as Q } from "../chunks/Docstring.34b3584e.js";
import { H as fe, E as $e } from "../chunks/MermaidChart.svelte_svelte_type_style_lang.c7e0d7fc.js";

/**
 * Fragment factory for the page.
 *
 * Builds the child components (copy menu, two headings, four docstring
 * panels, edit-on-GitHub footer) and returns the fragment object whose
 * methods create (`c`), claim/hydrate (`l`), set attributes (`h`),
 * mount (`m`), update (`p`), transition in/out (`i`/`o`) and destroy (`d`)
 * the DOM for this page.
 *
 * @param {*} oe - Unused by this fragment.
 * @returns {object} Fragment object with methods c, l, h, m, p, i, o, d.
 */
function be(oe) {
  // DOM nodes and whitespace anchors, assigned in c() or l().
  let f, j, S, q, O, U, y, F, G, s, X, d, Y, k, ee, C, te, c, ae, P, re, p, ne, L, R, W, A;
  // Child component instances.
  let T, x, M, I, D, w, z, Z;
  // Tracks whether the intro transitions have already been started.
  let B;

  // Static markup / text content for the <p> elements on the page.
  const se =
    'A Transformer model for image-like data from <a href="https://huggingface.co/Tongyi-MAI/Z-Image-Turbo" rel="nofollow">Z-Image</a>.';
  const ie =
    'The <a href="/docs/diffusers/pr_13732/en/api/models/z_image_transformer2d#diffusers.ZImageTransformer2DModel">ZImageTransformer2DModel</a> forward method.';
  // Multi-line template literals are kept flush-left: any indentation on the
  // continuation line would become part of the rendered text.
  const me = `Flow: patchify -> t_embed -> x_embed -> x_refine -> cap_embed -> cap_refine
-> [siglip_embed -> siglip_refine] -> build_unified -> main_layers -> final_layer -> unpatchify`;
  const le = "Patchify for basic mode: single image per batch item.";
  const de = "Patchify for omni mode: multiple images per batch item with noise masks.";

  // Shared class list applied to every docstring wrapper <div> in h().
  const docstringClass =
    "docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8";

  T = new ve({
    props: {
      containerStyle:
        "float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;",
    },
  });

  x = new fe({
    props: {
      title: "ZImageTransformer2DModel",
      local: "zimagetransformer2dmodel",
      headingTag: "h1",
    },
  });

  M = new fe({
    props: {
      title: "ZImageTransformer2DModel",
      local: "diffusers.ZImageTransformer2DModel",
      headingTag: "h2",
    },
  });

  // Class signature panel.
  I = new Q({
    props: {
      name: "class diffusers.ZImageTransformer2DModel",
      anchor: "diffusers.ZImageTransformer2DModel",
      parameters: [
        { name: "all_patch_size", val: " = (2,)" },
        { name: "all_f_patch_size", val: " = (1,)" },
        { name: "in_channels", val: " = 16" },
        { name: "dim", val: " = 3840" },
        { name: "n_layers", val: " = 30" },
        { name: "n_refiner_layers", val: " = 2" },
        { name: "n_heads", val: " = 30" },
        { name: "n_kv_heads", val: " = 30" },
        { name: "norm_eps", val: " = 1e-05" },
        { name: "qk_norm", val: " = True" },
        { name: "cap_feat_dim", val: " = 2560" },
        { name: "siglip_feat_dim", val: " = None" },
        { name: "rope_theta", val: " = 256.0" },
        { name: "t_scale", val: " = 1000.0" },
        { name: "axes_dims", val: " = [32, 48, 48]" },
        { name: "axes_lens", val: " = [1024, 512, 512]" },
      ],
      source:
        "https://github.com/huggingface/diffusers/blob/vr_13732/src/diffusers/models/transformers/transformer_z_image.py#L359",
    },
  });

  // `forward` method panel, including per-parameter descriptions.
  D = new Q({
    props: {
      name: "forward",
      anchor: "diffusers.ZImageTransformer2DModel.forward",
      parameters: [
        { name: "x", val: ": list" },
        { name: "t", val: "" },
        { name: "cap_feats", val: ": list" },
        { name: "return_dict", val: ": bool = True" },
        { name: "controlnet_block_samples", val: ": dict[int, torch.Tensor] | None = None" },
        { name: "siglip_feats", val: ": list[list[torch.Tensor]] | None = None" },
        { name: "image_noise_mask", val: ": list[list[int]] | None = None" },
        { name: "patch_size", val: ": int = 2" },
        { name: "f_patch_size", val: ": int = 1" },
      ],
      parametersDescription: [
        {
          anchor: "diffusers.ZImageTransformer2DModel.forward.x",
          description: `<strong>x</strong> (<code>list</code> of <code>torch.Tensor</code> or nested <code>list</code> of <code>torch.Tensor</code>) —
Input latents. A flat list when running in standard mode, or a nested list when running in omni mode.`,
          name: "x",
        },
        {
          anchor: "diffusers.ZImageTransformer2DModel.forward.t",
          description: `<strong>t</strong> (<code>torch.Tensor</code>) —
Used to indicate denoising step.`,
          name: "t",
        },
        {
          anchor: "diffusers.ZImageTransformer2DModel.forward.cap_feats",
          description: `<strong>cap_feats</strong> (<code>list</code> of <code>torch.Tensor</code> or nested <code>list</code> of <code>torch.Tensor</code>) —
Conditional caption embeddings (embeddings computed from the input conditions such as prompts) to use.`,
          name: "cap_feats",
        },
        {
          anchor: "diffusers.ZImageTransformer2DModel.forward.return_dict",
          description: `<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) —
Whether or not to return a <code>~models.transformer_2d.Transformer2DModelOutput</code> instead of a plain
tuple.`,
          name: "return_dict",
        },
        {
          anchor: "diffusers.ZImageTransformer2DModel.forward.controlnet_block_samples",
          description: `<strong>controlnet_block_samples</strong> (<code>dict</code> of <code>int</code> to <code>torch.Tensor</code>, <em>optional</em>) —
A mapping from block index to tensor that if specified are added to the residuals of transformer
blocks.`,
          name: "controlnet_block_samples",
        },
        {
          anchor: "diffusers.ZImageTransformer2DModel.forward.siglip_feats",
          description: `<strong>siglip_feats</strong> (<code>list</code> of <code>list</code> of <code>torch.Tensor</code>, <em>optional</em>) —
Optional SigLIP image features used as additional conditioning.`,
          name: "siglip_feats",
        },
        {
          anchor: "diffusers.ZImageTransformer2DModel.forward.image_noise_mask",
          description: `<strong>image_noise_mask</strong> (<code>list</code> of <code>list</code> of <code>int</code>, <em>optional</em>) —
Per-image noise masks indicating noisy vs. clean tokens in omni mode.`,
          name: "image_noise_mask",
        },
        {
          anchor: "diffusers.ZImageTransformer2DModel.forward.patch_size",
          description: `<strong>patch_size</strong> (<code>int</code>, <em>optional</em>, defaults to 2) —
Spatial patch size used to patchify the input latents.`,
          name: "patch_size",
        },
        {
          anchor: "diffusers.ZImageTransformer2DModel.forward.f_patch_size",
          description: `<strong>f_patch_size</strong> (<code>int</code>, <em>optional</em>, defaults to 1) —
Temporal patch size used to patchify the input latents.`,
          name: "f_patch_size",
        },
      ],
      source:
        "https://github.com/huggingface/diffusers/blob/vr_13732/src/diffusers/models/transformers/transformer_z_image.py#L894",
    },
  });

  // `patchify_and_embed` method panel.
  w = new Q({
    props: {
      name: "patchify_and_embed",
      anchor: "diffusers.ZImageTransformer2DModel.patchify_and_embed",
      parameters: [
        { name: "all_image", val: ": list" },
        { name: "all_cap_feats", val: ": list" },
        { name: "patch_size", val: ": int" },
        { name: "f_patch_size", val: ": int" },
      ],
      source:
        "https://github.com/huggingface/diffusers/blob/vr_13732/src/diffusers/models/transformers/transformer_z_image.py#L588",
    },
  });

  // `patchify_and_embed_omni` method panel.
  z = new Q({
    props: {
      name: "patchify_and_embed_omni",
      anchor: "diffusers.ZImageTransformer2DModel.patchify_and_embed_omni",
      parameters: [
        { name: "all_x", val: ": list" },
        { name: "all_cap_feats", val: ": list" },
        { name: "all_siglip_feats", val: ": list" },
        { name: "patch_size", val: ": int" },
        { name: "f_patch_size", val: ": int" },
        { name: "images_noise_mask", val: ": list" },
      ],
      source:
        "https://github.com/huggingface/diffusers/blob/vr_13732/src/diffusers/models/transformers/transformer_z_image.py#L625",
    },
  });

  Z = new $e({
    props: {
      source:
        "https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/models/z_image_transformer2d.md",
    },
  });

  return {
    // Create: build every node and child fragment from scratch.
    c() {
      f = i("meta");
      j = r();
      S = i("p");
      q = r();
      _(T.$$.fragment);
      O = r();
      _(x.$$.fragment);
      U = r();
      y = i("p");
      y.innerHTML = se;
      F = r();
      _(M.$$.fragment);
      G = r();
      s = i("div");
      _(I.$$.fragment);
      X = r();
      d = i("div");
      _(D.$$.fragment);
      Y = r();
      k = i("p");
      k.innerHTML = ie;
      ee = r();
      C = i("p");
      C.textContent = me;
      te = r();
      c = i("div");
      _(w.$$.fragment);
      ae = r();
      P = i("p");
      P.textContent = le;
      re = r();
      p = i("div");
      _(z.$$.fragment);
      ne = r();
      L = i("p");
      L.textContent = de;
      R = r();
      _(Z.$$.fragment);
      W = r();
      A = i("p");
      this.h();
    },

    // Claim: reuse nodes found under `e`; static content is only rewritten
    // when the value returned by V(...) differs from the expected hash string.
    l(e) {
      const a = he("svelte-u9bgzb", document.head);
      f = m(a, "META", { name: true, content: true });
      a.forEach(t);
      j = n(e);
      S = m(e, "P", {});
      N(S).forEach(t);
      q = n(e);
      u(T.$$.fragment, e);
      O = n(e);
      u(x.$$.fragment, e);
      U = n(e);
      y = m(e, "P", { "data-svelte-h": true });
      if (V(y) !== "svelte-1x46gmc") {
        y.innerHTML = se;
      }
      F = n(e);
      u(M.$$.fragment, e);
      G = n(e);
      s = m(e, "DIV", { class: true });

      const g = N(s);
      u(I.$$.fragment, g);
      X = n(g);
      d = m(g, "DIV", { class: true });

      const E = N(d);
      u(D.$$.fragment, E);
      Y = n(E);
      k = m(E, "P", { "data-svelte-h": true });
      if (V(k) !== "svelte-5kue8x") {
        k.innerHTML = ie;
      }
      ee = n(E);
      C = m(E, "P", { "data-svelte-h": true });
      if (V(C) !== "svelte-1joj8qn") {
        C.textContent = me;
      }
      E.forEach(t);
      te = n(g);
      c = m(g, "DIV", { class: true });

      const J = N(c);
      u(w.$$.fragment, J);
      ae = n(J);
      P = m(J, "P", { "data-svelte-h": true });
      if (V(P) !== "svelte-bc1613") {
        P.textContent = le;
      }
      J.forEach(t);
      re = n(g);
      p = m(g, "DIV", { class: true });

      const K = N(p);
      u(z.$$.fragment, K);
      ne = n(K);
      L = m(K, "P", { "data-svelte-h": true });
      if (V(L) !== "svelte-1ug80iy") {
        L.textContent = de;
      }
      K.forEach(t);
      g.forEach(t);
      R = n(e);
      u(Z.$$.fragment, e);
      W = n(e);
      A = m(e, "P", {});
      N(A).forEach(t);
      this.h();
    },

    // Hydrate attributes shared by the create and claim paths.
    h() {
      H(f, "name", "hf:doc:metadata");
      H(f, "content", Te);
      H(d, "class", docstringClass);
      H(c, "class", docstringClass);
      H(p, "class", docstringClass);
      H(s, "class", docstringClass);
    },

    // Mount: the <meta> goes into document.head, everything else into `e`
    // before anchor `a`; the three method panels are nested inside `s`.
    m(e, a) {
      o(document.head, f);
      l(e, j, a);
      l(e, S, a);
      l(e, q, a);
      h(T, e, a);
      l(e, O, a);
      h(x, e, a);
      l(e, U, a);
      l(e, y, a);
      l(e, F, a);
      h(M, e, a);
      l(e, G, a);
      l(e, s, a);
      h(I, s, null);
      o(s, X);
      o(s, d);
      h(D, d, null);
      o(d, Y);
      o(d, k);
      o(d, ee);
      o(d, C);
      o(s, te);
      o(s, c);
      h(w, c, null);
      o(c, ae);
      o(c, P);
      o(s, re);
      o(s, p);
      h(z, p, null);
      o(p, ne);
      o(p, L);
      l(e, R, a);
      h(Z, e, a);
      l(e, W, a);
      l(e, A, a);
      B = true;
    },

    // Update: delegated to the imported `pe` helper.
    p: pe,

    // Intro: run once until the matching outro resets the flag.
    i(e) {
      if (B) {
        return;
      }
      for (const child of [T, x, M, I, D, w, z, Z]) {
        v(child.$$.fragment, e);
      }
      B = true;
    },

    // Outro.
    o(e) {
      for (const child of [T, x, M, I, D, w, z, Z]) {
        $(child.$$.fragment, e);
      }
      B = false;
    },

    // Destroy: top-level nodes are detached only when `e` is truthy; the
    // head <meta> is always detached.
    d(e) {
      if (e) {
        t(j);
        t(S);
        t(q);
        t(O);
        t(U);
        t(y);
        t(F);
        t(G);
        t(s);
        t(R);
        t(W);
        t(A);
      }
      t(f);
      b(T, e);
      b(x, e);
      b(M, e);
      // The nested panels are destroyed without the detaching flag.
      b(I);
      b(D);
      b(w);
      b(z);
      b(Z, e);
    },
  };
}

// JSON table-of-contents metadata written to <meta name="hf:doc:metadata">.
const Te =
  '{"title":"ZImageTransformer2DModel","local":"zimagetransformer2dmodel","sections":[{"title":"ZImageTransformer2DModel","local":"diffusers.ZImageTransformer2DModel","sections":[],"depth":2}],"depth":1}';

/**
 * Instance initialiser for the page component.
 *
 * Registers a callback through `ge` that reads the `fw` query parameter and
 * discards the value, then returns an empty array.
 *
 * @param {*} oe - Unused.
 * @returns {Array} Always an empty array.
 */
function xe(oe) {
  ge(() => {
    new URLSearchParams(window.location.search).get("fw");
  });
  return [];
}

/** Page component: wires `xe` and `be` into the `_e` base class via `ue`. */
class ze extends _e {
  constructor(f) {
    super();
    ue(this, f, xe, be, ce, {});
  }
}

export { ze as component };
Xet Storage Details
- Size:
- 9.61 kB
- Xet hash:
- e4db93d6289be56c6e29d79779824cac54355278b85ca7bc09332a1cd00c4cef
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.