imlixinyang committed on
Commit c8df52d · 1 Parent(s): 123eeba
.gitignore ADDED
@@ -0,0 +1,4 @@
+ tmpfiles/
+ model.ckpt
+
+ **/__pycache__/**
README.md CHANGED
@@ -38,6 +38,8 @@ pip install torch torchvision
 pip install triton transformers pytorch_lightning omegaconf ninja numpy jaxtyping rich tensorboard einops moviepy==1.0.3 webdataset accelerate opencv-python lpips av plyfile ftfy peft tensorboard pandas flask
 ```

+ Please refer to the `requirements.txt` file for the exact package versions.
+
 - install ```gsplat@1.5.2``` and ```diffusers@wan-5Bi2v``` packages
 ```
 pip install git+https://github.com/nerfstudio-project/gsplat.git@32f2a54d21c7ecb135320bb02b136b7407ae5712
@@ -55,9 +57,10 @@ cd FlashWorld
 python app.py
 ```

- Then, enjoy your journey in FlashWorld!
-
+ Then, open your web browser and navigate to ```http://HOST_IP:7860``` to start exploring FlashWorld!

+ <!-- We also provide example trajectory json files and input images in the `examples/` directory. -->
+
 ## More Generation Results

 [https://github.com/user-attachments/assets/bbdbe5de-5e15-4471-b380-4d8191688d82](https://github.com/user-attachments/assets/53d41748-4c35-48c4-9771-f458421c0b38)
@@ -67,7 +70,6 @@ Then, enjoy your journey in FlashWorld!

 Licensed under the CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International)

-
 The code is released for academic research use only.

 If you have any questions, please contact me via [imlixinyang@gmail.com](mailto:imlixinyang@gmail.com).
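
The `/generate` endpoint defined in the new `app.py` (added below) consumes a JSON body with `text_prompt`/`image_prompt`, a `cameras` list, and a `resolution` triple. The following is a minimal sketch of that schema; the numeric values are placeholders, and the quaternion component order is an assumption here, not something the commit documents.

```python
# Hypothetical request body for POST /generate, mirroring the fields that
# app.py reads. All values are placeholders; the quaternion order is assumed.
camera = {
    "quaternion": [1.0, 0.0, 0.0, 0.0],  # identity rotation (assumed component order)
    "position": [0.0, 0.0, 0.0],         # camera center
    "fx": 500.0, "fy": 500.0,            # focal lengths in pixels
    "cx": 352.0, "cy": 240.0,            # principal point in pixels
}
request_body = {
    "text_prompt": "a cozy living room",
    "cameras": [dict(camera) for _ in range(24)],  # one entry per camera sample
    "resolution": [24, 480, 704],                  # [n_frame, height, width]
    "image_index": 0,                              # frame pinned to the image prompt, if any
}
```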
app.py ADDED
@@ -0,0 +1,651 @@
+ try:
+     import spaces
+     GPU = spaces.GPU
+     print("spaces GPU is available")
+ except ImportError:
+     def GPU(func):
+         return func
+
+ import os
+ import subprocess
+
+ # def install_cuda_toolkit():
+ #     # CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"
+ #     CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run"
+ #     CUDA_TOOLKIT_FILE = "/tmp/%s" % os.path.basename(CUDA_TOOLKIT_URL)
+ #     subprocess.call(["wget", "-q", CUDA_TOOLKIT_URL, "-O", CUDA_TOOLKIT_FILE])
+ #     subprocess.call(["chmod", "+x", CUDA_TOOLKIT_FILE])
+ #     subprocess.call([CUDA_TOOLKIT_FILE, "--silent", "--toolkit"])
+
+ #     os.environ["CUDA_HOME"] = "/usr/local/cuda"
+ #     os.environ["PATH"] = "%s/bin:%s" % (os.environ["CUDA_HOME"], os.environ["PATH"])
+ #     os.environ["LD_LIBRARY_PATH"] = "%s/lib:%s" % (
+ #         os.environ["CUDA_HOME"],
+ #         "" if "LD_LIBRARY_PATH" not in os.environ else os.environ["LD_LIBRARY_PATH"],
+ #     )
+ #     # Fix: arch_list[-1] += '+PTX'; IndexError: list index out of range
+ #     os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"
+
+ #     print("Successfully installed CUDA toolkit at: ", os.environ["CUDA_HOME"])
+
+ #     subprocess.call('rm /usr/bin/gcc', shell=True)
+ #     subprocess.call('rm /usr/bin/g++', shell=True)
+ #     subprocess.call('rm /usr/local/cuda/bin/gcc', shell=True)
+ #     subprocess.call('rm /usr/local/cuda/bin/g++', shell=True)
+
+ #     subprocess.call('ln -s /usr/bin/gcc-11 /usr/bin/gcc', shell=True)
+ #     subprocess.call('ln -s /usr/bin/g++-11 /usr/bin/g++', shell=True)
+
+ #     subprocess.call('ln -s /usr/bin/gcc-11 /usr/local/cuda/bin/gcc', shell=True)
+ #     subprocess.call('ln -s /usr/bin/g++-11 /usr/local/cuda/bin/g++', shell=True)
+
+ #     subprocess.call('gcc --version', shell=True)
+ #     subprocess.call('g++ --version', shell=True)
+
+ # install_cuda_toolkit()
+
+ # subprocess.run('pip install git+https://github.com/nerfstudio-project/gsplat.git@32f2a54d21c7ecb135320bb02b136b7407ae5712 --no-build-isolation --use-pep517', env={'CUDA_HOME': "/usr/local/cuda", "TORCH_CUDA_ARCH_LIST": "8.0;8.6"}, shell=True)
+
+ from flask import Flask, jsonify, request, send_file, render_template
+ import base64
+ import io
+ from PIL import Image
+ import torch
+ import numpy as np
+ import os
+ import argparse
+ import imageio
+ import json
+
+ import time
+ import threading
+
+ from concurrency_manager import ConcurrencyManager
+
+ from huggingface_hub import hf_hub_download
+
+ import einops
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import numpy as np
+
+ import imageio
+
+ from models import *
+ from utils import *
+
+ from transformers import T5TokenizerFast, UMT5EncoderModel
+
+ from diffusers import FlowMatchEulerDiscreteScheduler
+
+ class MyFlowMatchEulerDiscreteScheduler(FlowMatchEulerDiscreteScheduler):
+     # Pick the schedule index whose timestep is closest to the query timestep.
+     def index_for_timestep(self, timestep, schedule_timesteps=None):
+         if schedule_timesteps is None:
+             schedule_timesteps = self.timesteps
+
+         return torch.argmin(
+             (timestep - schedule_timesteps.to(timestep.device)).abs(), dim=0).item()
+
+ class GenerationSystem(nn.Module):
+     def __init__(self, ckpt_path=None, device="cuda:0", offload_t5=False, offload_vae=False):
+         super().__init__()
+         self.device = device
+         self.offload_t5 = offload_t5
+         self.offload_vae = offload_vae
+
+         self.latent_dim = 48
+         self.temporal_downsample_factor = 4
+         self.spatial_downsample_factor = 16
+
+         self.feat_dim = 1024
+
+         self.latent_patch_size = 2
+
+         self.denoising_steps = [0, 250, 500, 750]
+
+         model_id = "Wan-AI/Wan2.2-TI2V-5B-Diffusers"
+
+         self.vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float).eval()
+
+         # Strip the temporal padding from the VAE's causal 3D convolutions so
+         # single-frame inputs can be encoded without temporal context.
+         from models.autoencoder_kl_wan import WanCausalConv3d
+         with torch.no_grad():
+             for name, module in self.vae.named_modules():
+                 if isinstance(module, WanCausalConv3d):
+                     time_pad = module._padding[4]
+                     module.padding = (0, module._padding[2], module._padding[0])
+                     module._padding = (0, 0, 0, 0, 0, 0)
+                     module.weight = torch.nn.Parameter(module.weight[:, :, time_pad:].clone())
+
+         self.vae.requires_grad_(False)
+
+         self.register_buffer('latents_mean', torch.tensor(self.vae.config.latents_mean).float().view(1, self.vae.config.z_dim, 1, 1, 1).to(self.device))
+         self.register_buffer('latents_std', torch.tensor(self.vae.config.latents_std).float().view(1, self.vae.config.z_dim, 1, 1, 1).to(self.device))
+
+         self.latent_scale_fn = lambda x: (x - self.latents_mean) / self.latents_std
+         self.latent_unscale_fn = lambda x: x * self.latents_std + self.latents_mean
+
+         self.tokenizer = T5TokenizerFast.from_pretrained(model_id, subfolder="tokenizer")
+
+         self.text_encoder = UMT5EncoderModel.from_pretrained(model_id, subfolder="text_encoder", torch_dtype=torch.float32).eval().requires_grad_(False).to(self.device if not self.offload_t5 else "cpu")
+
+         self.transformer = WanTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.float32).train().requires_grad_(False)
+
+         # Widen the patch embedding to accept raymap and condition-latent channels.
+         self.transformer.patch_embedding.weight = nn.Parameter(F.pad(self.transformer.patch_embedding.weight, (0, 0, 0, 0, 0, 0, 0, 6 + self.latent_dim)))
+         # self.transformer.rope.freqs_f[:] = self.transformer.rope.freqs_f[:1]
+
+         # Extend proj_out so the transformer also emits feat_dim reconstruction features per patch.
+         weight = self.transformer.proj_out.weight.reshape(self.latent_patch_size ** 2, self.latent_dim, self.transformer.proj_out.weight.shape[1])
+         bias = self.transformer.proj_out.bias.reshape(self.latent_patch_size ** 2, self.latent_dim)
+
+         extra_weight = torch.randn(self.latent_patch_size ** 2, self.feat_dim, self.transformer.proj_out.weight.shape[1]) * 0.02
+         extra_bias = torch.zeros(self.latent_patch_size ** 2, self.feat_dim)
+
+         self.transformer.proj_out.weight = nn.Parameter(torch.cat([weight, extra_weight], dim=1).flatten(0, 1).detach().clone())
+         self.transformer.proj_out.bias = nn.Parameter(torch.cat([bias, extra_bias], dim=1).flatten(0, 1).detach().clone())
+
+         self.recon_decoder = WANDecoderPixelAligned3DGSReconstructionModel(self.vae, self.feat_dim, use_render_checkpointing=True, use_network_checkpointing=False).train().requires_grad_(False).to(self.device)
+
+         self.scheduler = MyFlowMatchEulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler", shift=3)
+
+         self.register_buffer('timesteps', self.scheduler.timesteps.clone().to(self.device))
+
+         self.transformer.disable_gradient_checkpointing()
+         self.transformer.gradient_checkpointing = False
+
+         self.add_feedback_for_transformer()
+
+         if ckpt_path is not None:
+             state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=False)
+             self.transformer.load_state_dict(state_dict["transformer"])
+             self.recon_decoder.load_state_dict(state_dict["recon_decoder"])
+             print(f"Loaded {ckpt_path}.")
+
+         from quant import FluxFp8GeMMProcessor
+
+         FluxFp8GeMMProcessor(self.transformer)
+
+         del self.vae.post_quant_conv, self.vae.decoder
+         self.vae.to(self.device if not self.offload_vae else "cpu")
+
+         self.transformer.to(self.device)
+
+     def add_feedback_for_transformer(self):
+         # Reserve extra input channels so the previous step's features and prediction can be fed back in.
+         self.use_feedback = True
+         self.transformer.patch_embedding.weight = nn.Parameter(F.pad(self.transformer.patch_embedding.weight, (0, 0, 0, 0, 0, 0, 0, self.feat_dim + self.latent_dim)))
+
+     def encode_text(self, texts):
+         max_sequence_length = 512
+
+         text_inputs = self.tokenizer(
+             texts,
+             padding="max_length",
+             max_length=max_sequence_length,
+             truncation=True,
+             add_special_tokens=True,
+             return_attention_mask=True,
+             return_tensors="pt",
+         )
+         if getattr(self, "offload_t5", False):
+             text_input_ids = text_inputs.input_ids.to("cpu")
+             mask = text_inputs.attention_mask.to("cpu")
+         else:
+             text_input_ids = text_inputs.input_ids.to(self.device)
+             mask = text_inputs.attention_mask.to(self.device)
+         seq_lens = mask.gt(0).sum(dim=1).long()
+
+         if getattr(self, "offload_t5", False):
+             with torch.no_grad():
+                 text_embeds = self.text_encoder(text_input_ids, mask).last_hidden_state.to(self.device)
+         else:
+             text_embeds = self.text_encoder(text_input_ids, mask).last_hidden_state
+         text_embeds = [u[:v] for u, v in zip(text_embeds, seq_lens)]
+         text_embeds = torch.stack(
+             [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in text_embeds], dim=0
+         )
+         return text_embeds.float()
+
+     def forward_generator(self, noisy_latents, raymaps, condition_latents, t, text_embeds, cameras, render_cameras, image_height, image_width, need_3d_mode=True):
+
+         out = self.transformer(
+             hidden_states=torch.cat([noisy_latents, raymaps, condition_latents], dim=1),
+             timestep=t,
+             encoder_hidden_states=text_embeds,
+             return_dict=False,
+         )[0]
+
+         v_pred, feats = out.split([self.latent_dim, self.feat_dim], dim=1)
+
+         sigma = torch.stack([self.scheduler.sigmas[self.scheduler.index_for_timestep(_t)] for _t in t.unbind(0)], dim=0).to(self.device)
+         latents_pred_2d = noisy_latents - sigma * v_pred
+
+         if need_3d_mode:
+             scene_params = self.recon_decoder(
+                 einops.rearrange(feats, 'B C T H W -> (B T) C H W').unsqueeze(2),
+                 einops.rearrange(self.latent_unscale_fn(latents_pred_2d.detach()), 'B C T H W -> (B T) C H W').unsqueeze(2),
+                 cameras
+             ).flatten(1, -2)
+
+             images_pred, _ = self.recon_decoder.render(scene_params.unbind(0), render_cameras, image_height, image_width, bg_mode="white")
+
+             latents_pred_3d = einops.rearrange(self.latent_scale_fn(self.vae.encode(
+                 einops.rearrange(images_pred, 'B T C H W -> (B T) C H W', T=images_pred.shape[1]).unsqueeze(2).to(self.device if not self.offload_vae else "cpu").float()
+             ).latent_dist.sample().to(self.device)).squeeze(2), '(B T) C H W -> B C T H W', T=images_pred.shape[1]).to(noisy_latents.dtype)
+
+         return {
+             '2d': latents_pred_2d,
+             '3d': latents_pred_3d if need_3d_mode else None,
+             'rgb_3d': images_pred if need_3d_mode else None,
+             'scene': scene_params if need_3d_mode else None,
+             'feat': feats
+         }
+
+     @torch.no_grad()
+     @torch.amp.autocast(dtype=torch.bfloat16, device_type="cuda")
+     def generate(self, cameras, n_frame, image=None, text="", image_index=0, image_height=480, image_width=704, video_output_path=None):
+         with torch.no_grad():
+             batch_size = 1
+
+             cameras = cameras.to(self.device).unsqueeze(0)
+
+             if cameras.shape[1] != n_frame:
+                 render_cameras = cameras.clone()
+                 cameras = sample_from_dense_cameras(cameras.squeeze(0), torch.linspace(0, 1, n_frame, device=self.device)).unsqueeze(0)
+             else:
+                 render_cameras = cameras
+
+             cameras, ref_w2c, T_norm = normalize_cameras(cameras, return_meta=True, n_frame=None)
+
+             render_cameras = normalize_cameras(render_cameras, ref_w2c=ref_w2c, T_norm=T_norm, n_frame=None)
+
+             text = "[Static] " + text
+
+             text_embeds = self.encode_text([text])
+             # neg_text_embeds = self.encode_text([""]).repeat(batch_size, 1, 1)
+
+             masks = torch.zeros(batch_size, n_frame, device=self.device)
+
+             condition_latents = torch.zeros(batch_size, self.latent_dim, n_frame, image_height // self.spatial_downsample_factor, image_width // self.spatial_downsample_factor, device=self.device)
+
+             if image is not None:
+                 image = image.to(self.device)
+
+                 latent = self.latent_scale_fn(self.vae.encode(
+                     image.unsqueeze(0).unsqueeze(2).to(self.device if not self.offload_vae else "cpu").float()
+                 ).latent_dist.sample().to(self.device)).squeeze(2)
+
+                 masks[:, image_index] = 1
+                 condition_latents[:, :, image_index] = latent
+
+             raymaps = create_raymaps(cameras, image_height // self.spatial_downsample_factor, image_width // self.spatial_downsample_factor)
+             raymaps = einops.rearrange(raymaps, 'B T H W C -> B C T H W', T=n_frame)
+
+             noise = torch.randn(batch_size, self.latent_dim, n_frame, image_height // self.spatial_downsample_factor, image_width // self.spatial_downsample_factor, device=self.device)
+
+             noisy_latents = noise
+
+             torch.cuda.empty_cache()
+
+             if self.use_feedback:
+                 prev_latents_pred = torch.zeros(batch_size, self.latent_dim, n_frame, image_height // self.spatial_downsample_factor, image_width // self.spatial_downsample_factor, device=self.device)
+
+                 prev_feats = torch.zeros(batch_size, self.feat_dim, n_frame, image_height // self.spatial_downsample_factor, image_width // self.spatial_downsample_factor, device=self.device)
+
+             for i in range(len(self.denoising_steps)):
+                 t_ids = torch.full((noisy_latents.shape[0],), self.denoising_steps[i], device=self.device)
+
+                 t = self.timesteps[t_ids]
+
+                 if self.use_feedback:
+                     _condition_latents = torch.cat([condition_latents, prev_feats, prev_latents_pred], dim=1)
+                 else:
+                     _condition_latents = condition_latents
+
+                 if i < len(self.denoising_steps) - 1:
+                     out = self.forward_generator(noisy_latents, raymaps, _condition_latents, t, text_embeds, cameras, cameras, image_height, image_width, need_3d_mode=True)
+
+                     latents_pred = out["3d"]
+
+                     if self.use_feedback:
+                         prev_latents_pred = latents_pred
+                         prev_feats = out['feat']
+
+                     noisy_latents = self.scheduler.scale_noise(latents_pred, self.timesteps[torch.full((noisy_latents.shape[0],), self.denoising_steps[i + 1], device=self.device)], torch.randn_like(noise))
+
+                 else:
+                     out = self.transformer(
+                         hidden_states=torch.cat([noisy_latents, raymaps, _condition_latents], dim=1),
+                         timestep=t,
+                         encoder_hidden_states=text_embeds,
+                         return_dict=False,
+                     )[0]
+
+                     v_pred, feats = out.split([self.latent_dim, self.feat_dim], dim=1)
+
+                     sigma = torch.stack([self.scheduler.sigmas[self.scheduler.index_for_timestep(_t)] for _t in t.unbind(0)], dim=0).to(self.device)
+                     latents_pred = noisy_latents - sigma * v_pred
+
+                     scene_params = self.recon_decoder(
+                         einops.rearrange(feats, 'B C T H W -> (B T) C H W').unsqueeze(2),
+                         einops.rearrange(self.latent_unscale_fn(latents_pred.detach()), 'B C T H W -> (B T) C H W').unsqueeze(2),
+                         cameras
+                     ).flatten(1, -2)
+
+             if video_output_path is not None:
+                 interpolated_images_pred, _ = self.recon_decoder.render(scene_params.unbind(0), render_cameras, image_height, image_width, bg_mode="white")
+
+                 interpolated_images_pred = einops.rearrange(interpolated_images_pred[0].clamp(-1, 1).add(1).div(2), 'T C H W -> T H W C')
+
+                 interpolated_images_pred = [torch.cat([img], dim=1).detach().cpu().mul(255).numpy().astype(np.uint8) for i, img in enumerate(interpolated_images_pred.unbind(0))]
+
+                 imageio.mimwrite(video_output_path, interpolated_images_pred, fps=15, quality=8, macro_block_size=1)
+
+             scene_params = scene_params[0]
+
+             scene_params = scene_params.detach().cpu()
+
+             return scene_params, ref_w2c, T_norm
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--port', type=int, default=7860)
+     parser.add_argument("--ckpt", default=None)
+     parser.add_argument("--gpu", type=int, default=0)
+     parser.add_argument("--cache_dir", type=str, default="./tmpfiles")
+     parser.add_argument("--offload_t5", action="store_true")
+     parser.add_argument("--max_concurrent", type=int, default=1, help="Maximum concurrent generation tasks")
+     args, _ = parser.parse_known_args()
+
+     # Ensure model.ckpt exists, download if not present
+     if args.ckpt is None:
+         from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
+         ckpt_path = os.path.join(HUGGINGFACE_HUB_CACHE, "models--imlixinyang--FlashWorld", "snapshots", "6a8e88c6f88678ac098e4c82675f0aee555d6e5d", "model.ckpt")
+         if not os.path.exists(ckpt_path):
+             ckpt_path = hf_hub_download(repo_id="imlixinyang/FlashWorld", filename="model.ckpt")
+     else:
+         ckpt_path = args.ckpt
+
+     app = Flask(__name__)
+
+     # Initialize the GenerationSystem
+     device = f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu"
+     generation_system = GenerationSystem(ckpt_path=ckpt_path, device=device, offload_t5=args.offload_t5)
+
+     # Initialize the concurrency manager
+     concurrency_manager = ConcurrencyManager(max_concurrent=args.max_concurrent)
+
+     @app.after_request
+     def after_request(response):
+         response.headers.add('Access-Control-Allow-Origin', '*')
+         response.headers.add('Access-Control-Allow-Headers', 'Content-Type,Authorization')
+         response.headers.add('Access-Control-Allow-Methods', 'GET,PUT,POST,DELETE,OPTIONS')
+         return response
+
+     @GPU
+     def generate_wrapper(cameras, n_frame, image, text_prompt, image_index, image_height, image_width, video_output_path=None):
+         """Wrapper around the generation function, used for concurrency control."""
+         return generation_system.generate(cameras, n_frame, image, text_prompt, image_index, image_height, image_width, video_output_path)
+
+     def job_generate(file_id, cache_dir, payload):
+         """Generation task run by a worker thread: generates, writes to disk, and returns download info."""
+         # Unpack arguments
+         cameras = payload["cameras"]
+         n_frame = payload["n_frame"]
+         image = payload["image"]
+         text_prompt = payload["text_prompt"]
+         image_index = payload["image_index"]
+         image_height = payload["image_height"]
+         image_width = payload["image_width"]
+         data = payload["raw_request"]
+
+         # Run generation
+         scene_params, ref_w2c, T_norm = generation_system.generate(
+             cameras, n_frame, image, text_prompt, image_index, image_height, image_width, video_output_path=None
+         )
+
+         # Save request metadata
+         with open(os.path.join(cache_dir, f'{file_id}.json'), 'w') as f:
+             json.dump(data, f)
+
+         # Export the PLY file
+         splat_path = os.path.join(cache_dir, f'{file_id}.ply')
+         export_ply_for_gaussians(splat_path, scene_params, opacity_threshold=0.001, T_norm=T_norm)
+
+         file_size = os.path.getsize(splat_path) if os.path.exists(splat_path) else 0
+
+         return {
+             'file_id': file_id,
+             'file_path': splat_path,
+             'file_size': file_size,
+             'download_url': f'/download/{file_id}'
+         }
+
+     @app.route('/generate', methods=['POST', 'OPTIONS'])
+     def generate():
+         # Handle preflight request
+         if request.method == 'OPTIONS':
+             return jsonify({'status': 'ok'})
+
+         try:
+             data = request.get_json(force=True)
+
+             image_prompt = data.get('image_prompt', None)
+             text_prompt = data.get('text_prompt', "")
+             cameras = data.get('cameras')
+             resolution = data.get('resolution')
+             image_index = data.get('image_index', 0)
+
+             n_frame, image_height, image_width = resolution
+
+             if not image_prompt and text_prompt == "":
+                 return jsonify({'error': 'No Prompts provided'}), 400
+
+             # Process the image
+             if image_prompt:
+                 # image_prompt can be a file path or a base64 string
+                 if os.path.exists(image_prompt):
+                     image_prompt = Image.open(image_prompt)
+                 else:
+                     # image_prompt may look like "data:image/png;base64,...."
+                     if ',' in image_prompt:
+                         image_prompt = image_prompt.split(',', 1)[1]
+
+                     try:
+                         image_bytes = base64.b64decode(image_prompt)
+                         image_prompt = Image.open(io.BytesIO(image_bytes))
+                     except Exception as img_e:
+                         return jsonify({'error': f'Image decode error: {str(img_e)}'}), 400
+
+                 image = image_prompt.convert('RGB')
+
+                 w, h = image.size
+
+                 # center crop
+                 if image_height / h > image_width / w:
+                     scale = image_height / h
+                 else:
+                     scale = image_width / w
+
+                 new_h = int(image_height / scale)
+                 new_w = int(image_width / scale)
+
+                 image = image.crop(((w - new_w) // 2, (h - new_h) // 2,
+                                     new_w + (w - new_w) // 2, new_h + (h - new_h) // 2)).resize((image_width, image_height))
+
+                 for camera in cameras:
+                     camera['fx'] = camera['fx'] * scale
+                     camera['fy'] = camera['fy'] * scale
+                     camera['cx'] = (camera['cx'] - (w - new_w) // 2) * scale
+                     camera['cy'] = (camera['cy'] - (h - new_h) // 2) * scale
+
+                 image = torch.from_numpy(np.array(image)).float().permute(2, 0, 1) / 255.0 * 2 - 1
+             else:
+                 image = None
+
+             cameras = torch.stack([
+                 torch.from_numpy(np.array([camera['quaternion'][0], camera['quaternion'][1], camera['quaternion'][2], camera['quaternion'][3], camera['position'][0], camera['position'][1], camera['position'][2], camera['fx'] / image_width, camera['fy'] / image_height, camera['cx'] / image_width, camera['cy'] / image_height], dtype=np.float32))
+                 for camera in cameras
+             ], dim=0)
+
+             file_id = str(int(time.time() * 1000))
+
+             # Assemble task parameters; defer execution and disk writes to the worker thread
+             payload = {
+                 'cameras': cameras,
+                 'n_frame': n_frame,
+                 'image': image,
+                 'text_prompt': text_prompt,
+                 'image_index': image_index,
+                 'image_height': image_height,
+                 'image_width': image_width,
+                 'raw_request': data,
+             }
+
+             # Submit the task to the concurrency manager (asynchronous)
+             task_id = concurrency_manager.submit_task(
+                 job_generate, file_id, args.cache_dir, payload
+             )
+
+             # Return queue info immediately after submission
+             queue_status = concurrency_manager.get_queue_status()
+             queued_tasks = queue_status.get('queued_tasks', [])
+             try:
+                 queue_position = queued_tasks.index(task_id) + 1
+             except ValueError:
+                 # If a worker thread has already picked the task up, treat it as started (position 0)
+                 queue_position = 0
+
+             return jsonify({
+                 'success': True,
+                 'task_id': task_id,
+                 'file_id': file_id,
+                 'queue': {
+                     'queued_count': queue_status.get('queued_count', 0),
+                     'running_count': queue_status.get('running_count', 0),
+                     'position': queue_position
+                 }
+             }), 202
+
+         except Exception as e:
+             return jsonify({'error': f'Server error: {str(e)}'}), 500
+
+     @app.route('/download/<file_id>', methods=['GET'])
+     def download_file(file_id):
+         """Download the generated PLY file."""
+         file_path = os.path.join(args.cache_dir, f'{file_id}.ply')
+
+         if not os.path.exists(file_path):
+             return jsonify({'error': 'File not found'}), 404
+
+         return send_file(file_path, as_attachment=True, download_name=f'{file_id}.ply')
+
+     @app.route('/delete/<file_id>', methods=['DELETE', 'POST', 'OPTIONS'])
+     def delete_file_endpoint(file_id):
+         """Delete a generated file and its metadata (called by the front end after the download completes)."""
+         # CORS preflight
+         if request.method == 'OPTIONS':
+             return jsonify({'status': 'ok'})
+
+         try:
+             ply_path = os.path.join(args.cache_dir, f'{file_id}.ply')
+             json_path = os.path.join(args.cache_dir, f'{file_id}.json')
+             deleted = []
+             for path in [ply_path, json_path]:
+                 if os.path.exists(path):
+                     os.remove(path)
+                     deleted.append(os.path.basename(path))
+             return jsonify({'success': True, 'deleted': deleted})
+         except Exception as e:
+             return jsonify({'success': False, 'error': str(e)}), 500
+
+     @app.route('/status', methods=['GET'])
+     def get_status():
+         """Get system status and queue info."""
+         try:
+             queue_status = concurrency_manager.get_queue_status()
+             return jsonify({
+                 'success': True,
+                 'status': queue_status,
+                 'timestamp': time.time()
+             })
+         except Exception as e:
+             return jsonify({'error': f'Failed to get status: {str(e)}'}), 500
+
+     @app.route('/task/<task_id>', methods=['GET'])
+     def get_task_status(task_id):
+         """Get the status of a specific task (queue position and, once finished, file info)."""
+         try:
+             task = concurrency_manager.get_task_status(task_id)
+             if not task:
+                 return jsonify({'error': 'Task not found'}), 404
+
+             queue_status = concurrency_manager.get_queue_status()
+             queued_tasks = queue_status.get('queued_tasks', [])
+             try:
+                 queue_position = queued_tasks.index(task_id) + 1
+             except ValueError:
+                 queue_position = 0
+
+             resp = {
+                 'success': True,
+                 'task_id': task_id,
+                 'status': task.status.value,
+                 'created_at': task.created_at,
+                 'started_at': task.started_at,
+                 'completed_at': task.completed_at,
+                 'error': task.error,
+                 'queue': {
+                     'queued_count': queue_status.get('queued_count', 0),
+                     'running_count': queue_status.get('running_count', 0),
+                     'position': queue_position
+                 }
+             }
+
+             if task.status.value == 'completed' and isinstance(task.result, dict):
+                 resp.update({
+                     'file_id': task.result.get('file_id'),
+                     'file_path': task.result.get('file_path'),
+                     'file_size': task.result.get('file_size'),
+                     'download_url': task.result.get('download_url'),
+                     'generation_time': (task.completed_at - task.started_at)
+                 })
+
+             # Update task status
+
+             return jsonify(resp)
+         except Exception as e:
+             return jsonify({'error': f'Failed to get task status: {str(e)}'}), 500
+
+     @app.route("/")
+     def index():
+         return send_file("index.html")
+
+     os.makedirs(args.cache_dir, exist_ok=True)
+
+     # Periodic background cleanup: delete cache files not accessed/modified for over 30 minutes
+     def cleanup_worker(cache_dir: str, max_age_seconds: int = 1800, interval_seconds: int = 300):
+         while True:
+             try:
+                 now = time.time()
+                 for name in os.listdir(cache_dir):
+                     # Only clean up task-related .ply/.json files
+                     if not (name.endswith('.ply') or name.endswith('.json')):
+                         continue
+                     path = os.path.join(cache_dir, name)
+                     try:
+                         mtime = os.path.getmtime(path)
+                         if now - mtime > max_age_seconds:
+                             os.remove(path)
+                     except FileNotFoundError:
+                         pass
+                     except Exception:
+                         # Ignore per-file errors and keep cleaning
+                         pass
+             except Exception:
+                 # Keep the thread from exiting on unexpected errors
+                 pass
+             time.sleep(interval_seconds)
+
+     cleaner_thread = threading.Thread(target=cleanup_worker, args=(args.cache_dir,), daemon=True)
+     cleaner_thread.start()
+
+     app.run(host='0.0.0.0', port=args.port)
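
Since `/generate` replies with HTTP 202 and a `task_id` rather than the finished scene, a client has to poll `/task/<task_id>` and then fetch the PLY from `download_url`. Below is a minimal sketch of that flow, assuming a local server on port 7860 and the third-party `requests` package; the prompt and camera values are placeholders, not part of this commit.

```python
# Minimal client sketch for the endpoints defined in app.py above.
import time
import requests

BASE = "http://localhost:7860"

camera = {"quaternion": [1.0, 0.0, 0.0, 0.0], "position": [0.0, 0.0, 0.0],
          "fx": 500.0, "fy": 500.0, "cx": 352.0, "cy": 240.0}
body = {"text_prompt": "a cozy living room",
        "cameras": [dict(camera) for _ in range(24)],
        "resolution": [24, 480, 704]}  # [n_frame, height, width]

# 1. Submit: the server replies 202 with a task_id immediately.
task_id = requests.post(f"{BASE}/generate", json=body).json()["task_id"]

# 2. Poll until the worker thread finishes the task.
while True:
    status = requests.get(f"{BASE}/task/{task_id}").json()
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(1.0)

# 3. Download the exported Gaussian-splat PLY on success.
if status["status"] == "completed":
    with open("scene.ply", "wb") as f:
        f.write(requests.get(BASE + status["download_url"]).content)
```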
concurrency_manager.py ADDED
@@ -0,0 +1,203 @@
+ import threading
+ import time
+ import uuid
+ from typing import Dict, List, Optional, Callable, Any
+ from dataclasses import dataclass
+ from enum import Enum
+
+ class TaskStatus(Enum):
+     QUEUED = "queued"
+     RUNNING = "running"
+     COMPLETED = "completed"
+     FAILED = "failed"
+
+ @dataclass
+ class Task:
+     task_id: str
+     status: TaskStatus
+     created_at: float
+     started_at: Optional[float] = None
+     completed_at: Optional[float] = None
+     result: Optional[Any] = None
+     error: Optional[str] = None
+     function: Optional[Callable] = None
+     args: tuple = ()
+     kwargs: Optional[dict] = None
+
+     def __post_init__(self):
+         if self.kwargs is None:
+             self.kwargs = {}
+
+ class ConcurrencyManager:
+     def __init__(self, max_concurrent: int = 2):
+         """
+         Concurrency control manager.
+
+         Args:
+             max_concurrent: maximum number of tasks that may run at once
+         """
+         self.max_concurrent = max_concurrent
+         self.running_tasks: Dict[str, Task] = {}
+         self.queued_tasks: List[Task] = []
+         self.completed_tasks: Dict[str, Task] = {}
+         self.lock = threading.RLock()
+         self.worker_threads: List[threading.Thread] = []
+         self.shutdown_event = threading.Event()
+
+         # Start the worker threads
+         self._start_workers()
+
+     def _start_workers(self):
+         """Start the worker threads."""
+         for i in range(self.max_concurrent):
+             worker = threading.Thread(target=self._worker_loop, daemon=True)
+             worker.start()
+             self.worker_threads.append(worker)
+
+     def _worker_loop(self):
+         """Main loop of a worker thread."""
+         while not self.shutdown_event.is_set():
+             try:
+                 task = self._get_next_task()
+                 if task:
+                     self._execute_task(task)
+                 else:
+                     # Sleep briefly when there is no task
+                     time.sleep(0.1)
+             except Exception as e:
+                 print(f"Worker thread error: {e}")
+                 time.sleep(1)
+
+     def _get_next_task(self) -> Optional[Task]:
+         """Get the next task to execute."""
+         with self.lock:
+             if self.queued_tasks:
+                 return self.queued_tasks.pop(0)
+             return None
+
+     def _execute_task(self, task: Task):
+         """Execute a task."""
+         try:
+             with self.lock:
+                 task.status = TaskStatus.RUNNING
+                 task.started_at = time.time()
+                 self.running_tasks[task.task_id] = task
+
+             # Run the task
+             if task.function:
+                 result = task.function(*task.args, **task.kwargs)
+                 task.result = result
+
+             # Mark as completed
+             with self.lock:
+                 task.status = TaskStatus.COMPLETED
+                 task.completed_at = time.time()
+                 self.completed_tasks[task.task_id] = task
+                 if task.task_id in self.running_tasks:
+                     del self.running_tasks[task.task_id]
+
+         except Exception as e:
+             # Mark as failed
+             with self.lock:
+                 task.status = TaskStatus.FAILED
+                 task.completed_at = time.time()
+                 task.error = str(e)
+                 self.completed_tasks[task.task_id] = task
+                 if task.task_id in self.running_tasks:
+                     del self.running_tasks[task.task_id]
+
+     def submit_task(self, func: Callable, *args, **kwargs) -> str:
+         """
+         Submit a task.
+
+         Args:
+             func: the function to execute
+             *args: positional arguments for the function
+             **kwargs: keyword arguments for the function
+
+         Returns:
+             task_id: the task ID
+         """
+         task_id = str(uuid.uuid4())
+         task = Task(
+             task_id=task_id,
+             status=TaskStatus.QUEUED,
+             created_at=time.time(),
+             function=func,
+             args=args,
+             kwargs=kwargs
+         )
+
+         with self.lock:
+             self.queued_tasks.append(task)
+
+         return task_id
+
+     def get_task_status(self, task_id: str) -> Optional[Task]:
+         """Get the status of a task."""
+         with self.lock:
+             if task_id in self.running_tasks:
+                 return self.running_tasks[task_id]
+             elif task_id in self.completed_tasks:
+                 return self.completed_tasks[task_id]
+             else:
+                 # Check tasks still waiting in the queue
+                 for task in self.queued_tasks:
+                     if task.task_id == task_id:
+                         return task
+                 return None
+
+     def get_queue_status(self) -> Dict[str, Any]:
+         """Get queue status."""
+         with self.lock:
+             return {
+                 "max_concurrent": self.max_concurrent,
+                 "running_count": len(self.running_tasks),
+                 "queued_count": len(self.queued_tasks),
+                 "completed_count": len(self.completed_tasks),
+                 "running_tasks": [task.task_id for task in self.running_tasks.values()],
+                 "queued_tasks": [task.task_id for task in self.queued_tasks],
+             }
+
+     def wait_for_task(self, task_id: str, timeout: Optional[float] = None) -> Task:
+         """
+         Wait for a task to finish.
+
+         Args:
+             task_id: the task ID
+             timeout: timeout in seconds; None means wait indefinitely
+
+         Returns:
+             Task: the finished task
+         """
+         start_time = time.time()
+
+         while True:
+             task = self.get_task_status(task_id)
+             if task and task.status in [TaskStatus.COMPLETED, TaskStatus.FAILED]:
+                 return task
+
+             if timeout and (time.time() - start_time) > timeout:
+                 raise TimeoutError(f"Task {task_id} timed out after {timeout} seconds")
+
+             time.sleep(0.1)
+
+     def cleanup_old_tasks(self, max_age_hours: int = 24):
+         """Clean up old tasks."""
+         current_time = time.time()
+         max_age_seconds = max_age_hours * 3600
+
+         with self.lock:
+             # Remove completed tasks that are too old
+             old_tasks = [
+                 task_id for task_id, task in self.completed_tasks.items()
+                 if current_time - task.completed_at > max_age_seconds
+             ]
+             for task_id in old_tasks:
+                 del self.completed_tasks[task_id]
+
+     def shutdown(self):
+         """Shut down the manager."""
+         self.shutdown_event.set()
+         for worker in self.worker_threads:
+             worker.join(timeout=5)
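
For reference, `ConcurrencyManager` can also be exercised on its own, outside the Flask app. A small stand-alone sketch follows; the `slow_square` job is a placeholder for a real generation task, not code from the repo.

```python
# Minimal usage sketch for the ConcurrencyManager defined above.
import time
from concurrency_manager import ConcurrencyManager

def slow_square(x):
    time.sleep(0.5)  # stand-in for a long-running generation job
    return x * x

manager = ConcurrencyManager(max_concurrent=2)
task_ids = [manager.submit_task(slow_square, i) for i in range(4)]

# wait_for_task blocks until the task is COMPLETED or FAILED.
for tid in task_ids:
    task = manager.wait_for_task(tid, timeout=30)
    print(tid, task.status.value, task.result)

manager.shutdown()
```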
index.html ADDED
@@ -0,0 +1,2130 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>FlashWorld Demo</title>
7
+ <meta name="description" content="">
8
+ <style>
9
+ body {
10
+ margin: 0;
11
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
12
+ background: #1a1a1a;
13
+ color: #ffffff;
14
+ overflow: hidden;
15
+ }
16
+
17
+ .main-container {
18
+ display: flex;
19
+ height: 100vh;
20
+ flex-direction: column;
21
+ }
22
+
23
+ .header {
24
+ background: rgba(0, 0, 0, 0.8);
25
+ padding: 15px 20px;
26
+ text-align: center;
27
+ border-bottom: 1px solid rgba(255, 255, 255, 0.1);
28
+ flex-shrink: 0;
29
+ }
30
+
31
+ .header h1 {
32
+ margin: 0;
33
+ color: white;
34
+ font-size: 1.8em;
35
+ font-weight: 600;
36
+ margin-bottom: 8px;
37
+ }
38
+ .header-title-wrap {
39
+ display: inline-flex;
40
+ align-items: center;
41
+ gap: 8px;
42
+ position: relative;
43
+ }
44
+
45
+ .header-links {
46
+ display: flex;
47
+ justify-content: center;
48
+ gap: 20px;
49
+ margin-top: 8px;
50
+ }
51
+
52
+ .header-links a {
53
+ color: #60a5fa;
54
+ text-decoration: none;
55
+ font-size: 0.9em;
56
+ padding: 5px 10px;
57
+ border: 1px solid #60a5fa;
58
+ border-radius: 5px;
59
+ transition: all 0.3s ease;
60
+ }
61
+
62
+ .header-links a:hover {
63
+ background: #60a5fa;
64
+ color: white;
65
+ }
66
+
67
+ .content-container {
68
+ display: flex;
69
+ flex: 1;
70
+ overflow: hidden;
71
+ }
72
+
73
+ .left-panel {
74
+ width: 280px;
75
+ background: rgba(0, 0, 0, 0.7);
76
+ border-right: 1px solid rgba(255, 255, 255, 0.1);
77
+ padding: 20px;
78
+ overflow-y: auto;
79
+ flex-shrink: 0;
80
+ }
81
+
82
+ .center-panel {
83
+ flex: 1;
84
+ position: relative;
85
+ background: #000;
86
+ display: flex;
87
+ justify-content: center;
88
+ align-items: center;
89
+ }
90
+
91
+ .right-panel {
92
+ width: 300px;
93
+ background: rgba(0, 0, 0, 0.7);
94
+ border-left: 1px solid rgba(255, 255, 255, 0.1);
95
+ padding: 20px;
96
+ overflow-y: auto;
97
+ flex-shrink: 0;
98
+ }
99
+
100
+ .guidance {
101
+ color: #e5e7eb;
102
+ }
103
+
104
+ .guidance h2 {
105
+ color: #ffffff;
106
+ margin-top: 0;
107
+ font-size: 1.3em;
108
+ border-bottom: 2px solid #60a5fa;
109
+ padding-bottom: 8px;
110
+ margin-bottom: 20px;
111
+ }
112
+
113
+ .gui-container h2{
114
+ color: #ffffff;
115
+ margin-top: 0;
116
+ font-size: 1.3em;
117
+ border-bottom: 2px solid #60fae5;
118
+ padding-bottom: 8px;
119
+ margin-bottom: 20px;
120
+ }
121
+
122
+ .step {
123
+ margin: 12px 0;
124
+ padding: 12px;
125
+ background: rgba(96, 165, 250, 0.1);
126
+ border-radius: 6px;
127
+ border-left: 3px solid #60a5fa;
128
+ }
129
+
130
+ .step h3 {
131
+ margin: 0 0 8px 0;
132
+ color: #ffffff;
133
+ font-size: 1em;
134
+ }
135
+
136
+ .step p {
137
+ margin: 4px 0;
138
+ line-height: 1.4;
139
+ font-size: 0.85em;
140
+ color: #d1d5db;
141
+ }
142
+
143
+ .controls-info {
144
+ background: rgba(168, 85, 247, 0.1);
145
+ border-left: 3px solid #a855f7;
146
+ }
147
+
148
+ .keyboard-shortcuts {
149
+ background: rgba(34, 197, 94, 0.1);
150
+ border-left: 3px solid #22c55e;
151
+ }
152
+
153
+ .loading {
154
+ position: absolute;
155
+ top: 50%;
156
+ left: 50%;
157
+ min-width: 300px;
158
+ min-height: 200px;
159
+ transform: translate(-50%, -50%);
160
+ background: rgba(0, 0, 0, 0.9);
161
+ color: white;
162
+ padding: 20px;
163
+ border-radius: 10px;
164
+ display: none;
165
+ z-index: 1000;
166
+ text-align: center;
167
+ vertical-align: middle;
168
+ }
169
+
170
+ .generation-info {
171
+ background: rgba(34, 197, 94, 0.1);
172
+ border: 1px solid #22c55e;
173
+ border-radius: 8px;
174
+ padding: 15px;
175
+ margin: 10px 0;
176
+ color: #22c55e;
177
+ font-family: 'Courier New', monospace;
178
+ font-size: 0.9em;
179
+ }
180
+
181
+ .progress-container {
182
+ width: 100%;
183
+ background: rgba(255, 255, 255, 0.1);
184
+ border-radius: 10px;
185
+ overflow: hidden;
186
+ margin: 10px 0;
187
+ position: relative;
188
+ }
189
+
190
+ .progress-bar {
191
+ height: 20px;
192
+ background: linear-gradient(90deg, #60a5fa, #3b82f6);
193
+ width: 0%;
194
+ transition: width 0.3s ease;
195
+ border-radius: 10px;
196
+ position: relative;
197
+ }
198
+
199
+ .progress-text {
200
+ position: absolute;
201
+ top: 50%;
202
+ left: 50%;
203
+ transform: translate(-50%, -50%);
204
+ color: white;
205
+ font-weight: bold;
206
+ font-size: 0.8em;
207
+ white-space: nowrap;
208
+ }
209
+
210
+ /* Info tooltip */
211
+ .info-tip {
212
+ display: inline-block;
213
+ position: relative;
214
+ margin-left: 8px;
215
+ width: 16px;
216
+ height: 16px;
217
+ line-height: 16px;
218
+ text-align: center;
219
+ border-radius: 50%;
220
+ background: #3b82f6;
221
+ color: #fff;
222
+ font-size: 12px;
223
+ cursor: default;
224
+ user-select: none;
225
+ }
226
+ .info-tip .tooltip {
227
+ display: none;
228
+ position: absolute;
229
+ left: 0;
230
+ top: calc(100% + 8px); /* show below the icon */
231
+ transform: none;
232
+ background: rgba(0,0,0,0.9);
233
+ color: #e5e7eb;
234
+ border: 1px solid rgba(255,255,255,0.15);
235
+ border-radius: 8px;
236
+ padding: 10px 12px;
237
+ font-size: 12px;
238
+ width: 360px; /* wider tooltip */
239
+ white-space: normal;
240
+ z-index: 2000; /* above GUI and other elements */
241
+ box-shadow: 0 4px 12px rgba(0,0,0,0.4);
242
+ }
243
+ .info-tip:hover .tooltip {
244
+ display: block;
245
+ }
246
+
247
+ .status-bar {
248
+ background: rgba(0, 0, 0, 0.9);
249
+ color: #60a5fa;
250
+ padding: 8px 15px;
251
+ font-family: 'Courier New', monospace;
252
+ font-size: 0.8em;
253
+ border-top: 1px solid rgba(255, 255, 255, 0.1);
254
+ flex-shrink: 0;
255
+ }
256
+
257
+ .canvas-container {
258
+ width: 100%;
259
+ height: 100%;
260
+ display: flex;
261
+ justify-content: center;
262
+ align-items: center;
263
+ background:
264
+ repeating-linear-gradient(
265
+ 45deg,
266
+ #1a1a1a 0px,
267
+ #1a1a1a 10px,
268
+ #2a2a2a 10px,
269
+ #2a2a2a 20px
270
+ );
271
+ position: relative;
272
+ }
273
+
274
+ .canvas-wrapper {
275
+ position: relative;
276
+ border: 2px solid #444;
277
+ background: #111;
278
+ box-shadow:
279
+ 0 0 20px rgba(0, 0, 0, 0.5),
280
+ inset 0 0 10px rgba(0, 0, 0, 0.3);
281
+ border-radius: 4px;
282
+ }
283
+
284
+ .canvas-wrapper canvas {
285
+ display: block;
286
+ border-radius: 2px;
287
+ }
288
+
289
+ /* Add a subtle animation to the canvas wrapper */
290
+ .canvas-wrapper:hover {
291
+ border-color: #666;
292
+ box-shadow:
293
+ 0 0 30px rgba(0, 0, 0, 0.7),
294
+ inset 0 0 15px rgba(0, 0, 0, 0.4);
295
+ }
296
+
297
+ /* Progress & status beautify */
298
+ .progress-container {
299
+ width: 100%;
300
+ height: 18px;
301
+ background: linear-gradient(180deg, rgba(255,255,255,0.06), rgba(255,255,255,0.02));
302
+ border: 1px solid rgba(255,255,255,0.12);
303
+ border-radius: 999px;
304
+ overflow: hidden;
305
+ box-shadow: 0 2px 10px rgba(0,0,0,0.35) inset;
306
+ position: relative;
307
+ }
308
+ .progress-bar {
309
+ height: 100%;
310
+ background: linear-gradient(90deg, #60a5fa, #8b5cf6);
311
+ box-shadow: 0 0 10px rgba(96,165,250,0.65);
312
+ position: relative;
313
+ transition: width .15s ease;
314
+ }
315
+ .progress-text {
316
+ position: absolute;
317
+ top: 50%;
318
+ left: 50%;
319
+ transform: translate(-50%, -50%);
320
+ font-size: 11px;
321
+ color: #f8fafc;
322
+ text-shadow: 0 1px 2px rgba(0,0,0,0.5);
323
+ pointer-events: none;
324
+ white-space: nowrap;
325
+ }
326
+
327
+ .status-badges {
328
+ display: flex;
329
+ gap: 8px;
330
+ flex-wrap: wrap;
331
+ margin-top: 8px;
332
+ }
333
+ .badge {
334
+ display: inline-flex;
335
+ align-items: center;
336
+ gap: 6px;
337
+ padding: 6px 10px;
338
+ border-radius: 8px;
339
+ font-size: 12px;
340
+ border: 1px solid rgba(255,255,255,0.12);
341
+ background: rgba(255,255,255,0.06);
342
+ }
343
+ .badge .dot { width: 8px; height: 8px; border-radius: 999px; }
344
+ .badge.queue .dot { background: #f59e0b; }
345
+ .badge.running .dot { background: #22c55e; }
346
+ .badge.time .dot { background: #60a5fa; }
347
+ .badge.bytes .dot { background: #a78bfa; }
348
+
349
+ .details-grid {
350
+ display: grid;
351
+ grid-template-columns: repeat(2, minmax(0, 1fr));
352
+ gap: 6px 12px;
353
+ margin-top: 8px;
354
+ font-size: 12px;
355
+ color: #cbd5e1;
356
+ }
357
+ .details-grid div { opacity: 0.9; }
358
+
359
+ /* Canvas resizing indicator */
360
+ .canvas-wrapper.resizing {
361
+ border-color: #60a5fa;
362
+ box-shadow:
363
+ 0 0 25px rgba(96, 165, 250, 0.3),
364
+ inset 0 0 10px rgba(96, 165, 250, 0.1);
365
+ }
366
+
367
+ .canvas-wrapper.resizing::after {
368
+ content: "Resizing...";
369
+ position: absolute;
370
+ top: 50%;
371
+ left: 50%;
372
+ transform: translate(-50%, -50%);
373
+ color: #60a5fa;
374
+ font-size: 12px;
375
+ font-weight: bold;
376
+ z-index: 10;
377
+ pointer-events: none;
378
+ }
379
+
380
+ /* GUI Panel Styling */
381
+ .gui-panel {
382
+ background: rgba(0, 0, 0, 0.8);
383
+ border-radius: 8px;
384
+ padding: 15px;
385
+ min-height: 400px;
386
+ }
387
+
388
+ .gui-panel .lil-gui {
389
+ --background-color: rgba(0, 0, 0, 0.8);
390
+ --text-color: #ffffff;
391
+ --title-background-color: rgba(96, 165, 250, 0.2);
392
+ --title-text-color: #ffffff;
393
+ --widget-color: rgba(96, 165, 250, 0.3);
394
+ --hover-color: rgba(96, 165, 250, 0.5);
395
+ }
396
+
397
+ /* Ensure GUI is visible */
398
+ .lil-gui {
399
+ position: relative !important;
400
+ z-index: 1000 !important;
401
+ }
402
+
403
+ @media (max-width: 1200px) {
404
+ .left-panel {
405
+ width: 250px;
406
+ }
407
+
408
+ .right-panel {
409
+ width: 280px;
410
+ }
411
+ }
412
+
413
+ @media (max-width: 768px) {
414
+ .content-container {
415
+ flex-direction: column;
416
+ }
417
+
418
+ .left-panel, .right-panel {
419
+ width: 100%;
420
+ height: auto;
421
+ max-height: 200px;
422
+ }
423
+
424
+ .center-panel {
425
+ flex: 1;
426
+ min-height: 400px;
427
+ }
428
+ }
429
+ </style>
430
+ <script type="importmap">
431
+ {
432
+ "imports": {
433
+ "three": "https://cdnjs.cloudflare.com/ajax/libs/three.js/0.174.0/three.module.js",
434
+ "@sparkjsdev/spark": "https://sparkjs.dev/releases/spark/0.1.6/spark.module.js",
435
+ "lil-gui": "https://cdn.jsdelivr.net/npm/lil-gui@0.20/+esm"
436
+ }
437
+ }
438
+ </script>
439
+ </head>
440
+ <body>
441
+ <div class="main-container">
442
+ <!-- Header Section -->
443
+ <header class="header">
444
+ <div style="display: flex; justify-content: space-between; align-items: center; width: 100%;">
445
+ <h1 style="margin: 0; flex: 1; text-align: left;">
446
+ <span class="header-title-wrap">FlashWorld Spark Demo
447
+ <span class="info-tip">!
448
+ <span class="tooltip" style="max-width: 260px; text-align: left;">Note: Front-end real-time rend ering in Spark uses compressed Gaussian Splat attributes. Visual quality in this demo may be lower than offline/back-end rendering.
449
+ Also, the generation is fast but the downloading may be slow, please be patient.
450
+ </span>
451
+ </span>
452
+ </span>
453
+ </h1>
454
+ <div class="header-links" style="margin-left: 20px;">
455
+ <a href="#" target="_blank">Paper</a>
456
+ <a href="#" target="_blank">Code</a>
457
+ <a href="#" target="_blank">Project Page</a>
458
+ </div>
459
+ </div>
460
+ </header>
461
+
462
+ <!-- Main Content Container -->
463
+ <div class="content-container">
464
+ <!-- Left Panel: Simplified Guidance -->
465
+ <div class="left-panel">
466
+ <div class="guidance">
467
+ <h2>Instructions</h2>
468
+
469
+ <div class="step">
470
+ <h3>1. Configure</h3>
471
+ <p>Set FOV and Resolution and Click "Fix Configurations"</p>
472
+ </div>
473
+
474
+
475
+ <div class="step">
476
+ <h3>2. Set Camera Trajectory</h3>
477
+ <p><b>Manual:</b> Navigate with mouse and keyboard, press <kbd>Space</kbd> to record</p>
478
+ <p><b>Template:</b> Select template type and click "Generate Trajectory"</p>
479
+ <p><b>JSON:</b> Load trajectory from JSON file</p>
480
+ </div>
481
+
482
+ <div class="step">
483
+ <h3>3. Add Prompts</h3>
484
+ <p>Upload image or enter text description</p>
485
+ </div>
486
+
487
+ <div class="step">
488
+ <h3>4. Generate</h3>
489
+ <p>Click "Generate!" to create your scene</p>
490
+ </div>
491
+
492
+ <div class="step controls-info">
493
+ <h3>Controls</h3>
494
+ <p><strong>Mouse/QE:</strong> Rotate view</p>
495
+ <p><strong>WASD/RF:</strong> Move</p>
496
+ <p><strong>Space:</strong> Record camera</p>
497
+ </div>
498
+
499
+ </div>
500
+ </div>
501
+
502
+ <!-- Center Panel: Canvas -->
503
+ <div class="center-panel">
504
+ <div class="canvas-container" id="canvas-container">
505
+ <div class="canvas-wrapper" id="canvas-wrapper">
506
+ <div class="loading" id="loading">
507
+ <h3>🎬 Generating Scene...</h3>
508
+ <p>Please wait while we create your 3D scene</p>
509
+ <div id="generation-info" class="generation-info" style="display: none;">
510
+ <div><strong>Generation Time:</strong> <span id="generation-time">-</span> seconds</div>
511
+ <div><strong>File Size:</strong> <span id="file-size">-</span> MB</div>
512
+ </div>
513
+ <div id="download-progress" style="display: none;">
514
+ <div class="progress-container">
515
+ <div class="progress-bar" id="progress-bar"></div>
516
+ <div class="progress-text" id="progress-text">0%</div>
517
+ </div>
518
+ <div class="status-badges" id="status-badges" style="display: none;">
519
+ <div class="badge queue" id="badge-queue"><span class="dot"></span><span id="badge-queue-text">Queue</span></div>
520
+ <div class="badge running" id="badge-running" style="display: none;"><span class="dot"></span><span id="badge-running-text">Running</span></div>
521
+ <div class="badge time" id="badge-time" style="display: none;"><span class="dot"></span><span id="badge-time-text">00:00</span></div>
522
+ </div>
523
+ <div id="queue-details" class="details-grid" style="display: none;"></div>
524
+ <div id="download-details" class="details-grid" style="display: none;"></div>
525
+ </div>
526
+ </div>
527
+ </div>
528
+ </div>
529
+ </div>
530
+
531
+ <!-- Right Panel: GUI -->
532
+ <div class="right-panel">
533
+ <div class="gui-container">
534
+ <!-- <h2>GUI</h2> -->
535
+ <div class="gui-panel" id="gui-container">
536
+ <!-- GUI will be inserted here -->
537
+ </div>
538
+ </div>
539
+
540
+ <!-- Image Preview Area -->
541
+ <div id="image-preview-area" style="padding: 10px; display: none;">
542
+ <div style="font-size: 12px; color: #ccc; margin-bottom: 8px; text-align: left;">Input Image Preview</div>
543
+ <div style="text-align: center;">
544
+ <img id="preview-img" style="max-width: 100%; max-height: 200px; border-radius: 4px; box-shadow: 0 2px 8px rgba(0,0,0,0.3);" />
545
+ </div>
546
+ </div>
547
+ </div>
548
+ </div>
549
+
550
+ <!-- Status Bar -->
551
+ <div class="status-bar" id="status-bar">
552
+ Ready to generate 3D scenes | Cameras: 0 | Status: Waiting for input
553
+ </div>
554
+ </div>
555
+
556
+ <!-- Hidden File Inputs -->
557
+ <input id="file-input" type="file" accept=".jpg,.png,.jpeg" multiple="true" style="display: none;" />
558
+ <input id="json-input" type="file" accept=".json" multiple="false" style="display: none;" />
559
+
560
+ <script type="module">
561
+ // =========================
562
+ // Imports & Global Variables
563
+ // =========================
564
+ import * as THREE from "three";
565
+ import { SplatMesh, SparkControls, textSplats } from "@sparkjsdev/spark";
566
+ import GUI from "lil-gui";
567
+
568
+ // Scene, Camera, Renderer, Controls
569
+ const scene = new THREE.Scene();
570
+ const camera = new THREE.PerspectiveCamera(60, window.innerWidth / window.innerHeight, 0.1, 1000);
571
+ camera.position.set(0, 0, 1.5);
572
+ const renderer = new THREE.WebGLRenderer();
573
+ renderer.setSize(window.innerWidth, window.innerHeight);
574
+
575
+ // Wait for DOM to be ready
576
+ function initializeRenderer() {
577
+ const canvasWrapper = document.getElementById('canvas-wrapper');
578
+ if (canvasWrapper) {
579
+ canvasWrapper.appendChild(renderer.domElement);
580
+
581
+ // Set initial canvas size based on current resolution
582
+ updateCanvasSize();
583
+ console.log('Canvas initialized in wrapper');
584
+ } else {
585
+ console.error('Canvas wrapper not found');
586
+ }
587
+ }
588
+
589
+ // Update canvas size based on selected resolution
590
+ function updateCanvasSize() {
591
+ const canvasWrapper = document.getElementById('canvas-wrapper');
592
+ if (!canvasWrapper) return;
593
+
594
+ // Show resizing indicator
595
+ canvasWrapper.classList.add('resizing');
596
+
597
+ // Get current resolution from GUI options
598
+ const resolution = guiOptions.Resolution.split('x');
599
+ const width = parseInt(resolution[2]) || 704; // W
600
+ const height = parseInt(resolution[1]) || 480; // H
601
+
602
+ // Set canvas size
603
+ renderer.setSize(width, height);
604
+ camera.aspect = width / height;
605
+ camera.updateProjectionMatrix();
606
+
607
+ // Update wrapper size to match canvas
608
+ canvasWrapper.style.width = width + 'px';
609
+ canvasWrapper.style.height = height + 'px';
610
+
611
+ // Remove resizing indicator after a short delay
612
+ setTimeout(() => {
613
+ canvasWrapper.classList.remove('resizing');
614
+ }, 300);
615
+
616
+ console.log('Canvas size updated:', width, 'x', height);
617
+ }
618
+
619
+ const controls = new SparkControls({ canvas: renderer.domElement });
620
+
621
+ // Camera splats and params
622
+ const cameraSplats = [];
623
+ const cameraParams = [];
624
+ const interpolatedCamerasSplats = [];
625
+
626
+ // State
627
+ let fixGenerationFOV = false;
628
+ let inputImageBase64 = null;
629
+ let inputImageResolution = null;
630
+ let currentGeneratedSplat = null; // Track the currently generated scene
631
+
632
+ // UI Elements
633
+ const loadingElement = document.getElementById('loading');
634
+ const statusBar = document.getElementById('status-bar');
635
+
636
+ // GUI variable - declare early
637
+ let gui = null;
638
+
639
+ // Status update function
640
+ function updateStatus(message, cameraCount = null) {
641
+ const cameraText = cameraCount !== null ? `Cameras: ${cameraCount}` : `Cameras: ${cameraParams.length}`;
642
+ statusBar.textContent = `${message} | ${cameraText} | Status: ${fixGenerationFOV ? 'Ready to record' : 'Configure settings'}`;
643
+ }
644
+
645
+ // Show/hide loading
646
+ function showLoading(show) {
647
+ loadingElement.style.display = show ? 'block' : 'none';
648
+ }
649
+
650
+ // Show generation info
651
+ function showGenerationInfo(generationTime, fileSize) {
652
+ const generationInfo = document.getElementById('generation-info');
653
+ const generationTimeElement = document.getElementById('generation-time');
654
+ const fileSizeElement = document.getElementById('file-size');
655
+
656
+ generationTimeElement.textContent = generationTime.toFixed(2);
657
+ fileSizeElement.textContent = (fileSize / (1024 * 1024)).toFixed(2);
658
+ generationInfo.style.display = 'block';
659
+ }
660
+
661
+ // Show download progress
662
+ function showDownloadProgress() {
663
+ const downloadProgress = document.getElementById('download-progress');
664
+ downloadProgress.style.display = 'block';
665
+ const qd = document.getElementById('queue-details');
666
+ const dd = document.getElementById('download-details');
667
+ const badges = document.getElementById('status-badges');
668
+ if (qd) qd.style.display = 'none';
669
+ if (dd) dd.style.display = 'none';
670
+ if (badges) badges.style.display = 'none';
671
+ }
672
+
673
+ // Update progress bar
674
+ function updateProgressBar(percentage) {
675
+ const progressBar = document.getElementById('progress-bar');
676
+ const progressText = document.getElementById('progress-text');
677
+
678
+ progressBar.style.width = percentage + '%';
679
+ progressText.textContent = `${Math.round(percentage)}%`;
680
+ }
681
+
682
+ // Update progress label text (stage indicator)
683
+ function setProgressLabel(text) {
684
+ const progressText = document.getElementById('progress-text');
685
+ if (progressText) progressText.textContent = text;
686
+ }
687
+
688
+ // ==============
689
+ // Queue handling
690
+ // ==============
691
+ let queuePollTimer = null;
692
+ let currentTaskId = null;
693
+ let initialQueuePosition = null;
694
+ let latestGenerationTime = null;
695
+ let lastDownloadPct = 0;
696
+ let lastDownloadUpdateTs = 0;
697
+
698
+ function showQueueWaiting(position, runningCount, queuedCount) {
699
+ // Use only the progress bar to show queue progress (from initial position to 0)
700
+ showDownloadProgress();
701
+ if (initialQueuePosition === null) {
702
+ // Initialize from first seen position; ensure >= 1 so 0 -> 100%
703
+ const initPos = (typeof position === 'number') ? position : 0;
704
+ initialQueuePosition = Math.max(initPos, 1);
705
+ }
706
+ const percent = initialQueuePosition && initialQueuePosition > 0
707
+ ? Math.max(0, Math.min(100, ((initialQueuePosition - (position || 0)) / initialQueuePosition) * 100))
708
+ : 0;
709
+ updateProgressBar(percent);
710
+ const totalWaiting = (position || 0) + (queuedCount || 0);
711
+ if (position !== null && position !== undefined) {
712
+ const pctText = `${Math.round(percent)}%`;
713
+ if (totalWaiting > 0) {
714
+ setProgressLabel(`Queued ${position}/${totalWaiting} (${pctText})`);
715
+ } else {
716
+ setProgressLabel(`Queued ${position} (${pctText})`);
717
+ }
718
+ } else {
719
+ setProgressLabel('Queued');
720
+ }
721
+ }
722
+
723
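+ // Queue-polling contract (inferred from the handling below): GET
+ // {BackendAddress}/task/{taskId} every 2 seconds, expecting JSON like
+ // { success, status: 'queued' | 'running' | 'completed' | 'failed',
+ //   queue: { position, running_count, queued_count }, download_url, file_id }.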
+ async function pollTaskUntilReady(taskId) {
724
+ currentTaskId = taskId;
725
+ initialQueuePosition = null;
726
+ if (queuePollTimer) {
727
+ clearInterval(queuePollTimer);
728
+ queuePollTimer = null;
729
+ }
730
+ const queueStartTs = Date.now();
731
+
732
+ const pollOnce = async () => {
733
+ try {
734
+ const resp = await fetch(`${guiOptions.BackendAddress}/task/${taskId}`);
735
+ if (!resp.ok) return;
736
+ const info = await resp.json();
737
+ if (!info || !info.success) return;
738
+
739
+ const pos = info.queue && typeof info.queue.position === 'number' ? info.queue.position : 0;
740
+ const running = info.queue ? info.queue.running_count : 0;
741
+ const queued = info.queue ? info.queue.queued_count : 0;
742
+ if (info.status === 'queued' || info.status === 'running') {
743
+ // Only progress bar; set stage label
744
+ if (info.status === 'queued') {
745
+ showQueueWaiting(pos, running, queued);
746
+ } else {
747
+ // Transitioned to running: finalize queue progress visually
748
+ updateProgressBar(100);
749
+ showDownloadProgress();
750
+ setProgressLabel('Generating...');
751
+ }
752
+ }
753
+
754
+ if (info.status === 'completed' && info.download_url) {
755
+ clearInterval(queuePollTimer);
756
+ queuePollTimer = null;
757
+ latestGenerationTime = typeof info.generation_time === 'number' ? info.generation_time : null;
758
+ // Proceed to download the generated file like the normal path
759
+ updateStatus('Downloading generated scene...', cameraParams.length);
760
+ const response = await fetch(guiOptions.BackendAddress + info.download_url);
761
+ if (!response.ok) throw new Error(`HTTP error! status: ${response.status}`);
762
+ const contentLength = response.headers.get('content-length');
763
+ const total = parseInt(contentLength || '0', 10);
764
+ // Show generation info immediately once we know it and total size from headers
765
+ showGenerationInfo(latestGenerationTime || 0, total);
766
+ let loaded = 0;
767
+ const reader = response.body.getReader();
768
+ const chunks = [];
769
+ updateProgressBar(0);
770
+ setProgressLabel('Downloading 0%');
771
+ lastDownloadPct = 0;
772
+ lastDownloadUpdateTs = 0;
773
+ while (true) {
774
+ const { done, value } = await reader.read();
775
+ if (done) break;
776
+ chunks.push(value);
777
+ loaded += value.length;
778
+ if (total) {
779
+ const pct = Math.min(100, (loaded / total) * 100);
780
+ const now = Date.now();
781
+ const rounded = Math.round(pct);
782
+ // Throttle and enforce monotonic increase
783
+ if (rounded > Math.round(lastDownloadPct) || (now - lastDownloadUpdateTs) > 200) {
784
+ lastDownloadPct = Math.max(lastDownloadPct, pct);
785
+ updateProgressBar(lastDownloadPct);
786
+ setProgressLabel(`Downloading ${Math.round(lastDownloadPct)}%`);
787
+ lastDownloadUpdateTs = now;
788
+ }
789
+ }
790
+ }
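+ // Download-progress updates above are throttled to at most one per 200 ms
+ // (unless the integer percentage increased) and kept monotonic, so the bar
+ // never moves backwards on uneven chunk sizes.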
791
+
792
+ if (instructionSplat) {
793
+ scene.remove(instructionSplat);
794
+ console.log('Instruction splat removed');
795
+ instructionSplat = null;
796
+ }
797
+
798
+ const blob = new Blob(chunks);
799
+ const url = URL.createObjectURL(blob);
800
+ // Continue to load the splat
801
+ updateStatus('Loading generated scene...', cameraParams.length);
802
+
803
+ const GeneratedSplat = new SplatMesh({ url });
804
+ scene.add(GeneratedSplat);
805
+ currentGeneratedSplat = GeneratedSplat;
806
+ updateStatus('Scene generated successfully!', cameraParams.length);
807
+ // Show generation time and total file size (MB)
808
+ showGenerationInfo(latestGenerationTime || 0, total || blob.size);
809
+ // Notify backend to delete the server file after client has downloaded it
810
+ try {
811
+ if (info.file_id) {
812
+ const resp = await fetch(`${guiOptions.BackendAddress}/delete/${info.file_id}`, { method: 'POST' });
813
+ if (!resp.ok) console.warn('Delete notify failed');
814
+ }
815
+ } catch (e) {
816
+ console.warn('Delete notify error', e);
817
+ }
818
+ hideDownloadProgress();
819
+ showLoading(false);
820
+ } else if (info.status === 'failed') {
821
+ clearInterval(queuePollTimer);
822
+ queuePollTimer = null;
823
+ throw new Error(info.error || 'Generation failed');
824
+ }
825
+ } catch (e) {
826
+ console.debug('Polling error:', e);
827
+ }
828
+ };
829
+
830
+ await pollOnce();
831
+ queuePollTimer = setInterval(pollOnce, 2000);
832
+ }
833
+
834
+ // Hide download progress
835
+ function hideDownloadProgress() {
836
+ const downloadProgress = document.getElementById('download-progress');
837
+ downloadProgress.style.display = 'none';
838
+ }
839
+
840
+ // Playback scrubber (0..1)
841
+ let userCameraState = null; // Stores the user's camera state before playback
842
+
843
+ // Get the interpolated camera at normalized time t in [0, 1]
844
+ function getInterpolatedCameraAtTime(t) {
845
+ if (cameraParams.length === 0) {
846
+ return camera;
847
+ }
848
+
849
+ if (cameraParams.length === 1) {
850
+ return cameraParams[0];
851
+ }
852
+
853
+ // Clamp t to the valid range
854
+ const clampedT = Math.max(0, Math.min(1, t));
855
+
856
+ // Locate the position within the camera sequence
857
+ const cameraIndex = clampedT * (cameraParams.length - 1);
858
+ const startIndex = Math.min(Math.floor(cameraIndex), cameraParams.length - 2);
859
+ const endIndex = startIndex + 1;
860
+ const startCamera = cameraParams[startIndex];
861
+ const endCamera = cameraParams[endIndex];
862
+
863
+ // Interpolation fraction between the two cameras
864
+ const _t = cameraIndex - startIndex;
865
+
866
+ // Interpolate with interpolateTwoCameras
867
+ return interpolateTwoCameras(startCamera, endCamera, _t);
868
+ }
869
+
870
+ function setCameraByScrub(t) {
871
+ if (cameraParams.length === 0) return;
872
+ const clampedT = Math.max(0, Math.min(1, t));
873
+ const camT = getInterpolatedCameraAtTime(clampedT);
874
+ camera.position.copy(camT.position);
875
+ camera.quaternion.copy(camT.quaternion);
876
+ camera.fov = camT.fov;
877
+ camera.updateProjectionMatrix();
878
+ }
879
+
880
+ // Supported resolutions
881
+ const supportedResolutions = [
882
+ { frame: 24, width: 704, height: 480 },
883
+ { frame: 24, width: 480, height: 704 }
884
+ ];
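+ // Resolution strings are encoded as "NxHxW" (frame count x height x width),
+ // e.g. "24x480x704" means 24 frames at height 480 and width 704.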
885
+
886
+ // GUI Options - declare early
887
+ const guiOptions = {
888
+ // Backend address; defaults to this page's host on port 7860
889
+ BackendAddress: `${window.location.protocol}//${window.location.hostname}:7860`,
890
+ FOV: 60,
891
+ LoadFromJson: () => {
892
+ const jsonInput = document.querySelector("#json-input");
893
+ if (jsonInput) jsonInput.click();
894
+ },
895
+ LoadTrajectoryFromJson: () => {
896
+ if (!fixGenerationFOV) {
897
+ updateStatus('Warning: Please fix configuration first before loading trajectory', cameraParams.length);
898
+ return;
899
+ }
900
+ // Set a flag indicating that only the trajectory should be loaded
901
+ window.loadTrajectoryOnly = true;
902
+ const jsonInput = document.querySelector("#json-input");
903
+ if (jsonInput) jsonInput.click();
904
+ },
905
+ fixGenerationFOV: () => {
906
+ // These controllers will be set when GUI is initialized
907
+ if (window.fixGenerationFOVController) window.fixGenerationFOVController.disable();
908
+ fixGenerationFOV = true;
909
+
910
+ const new_camera = new THREE.PerspectiveCamera(guiOptions.FOV, guiOptions.Resolution.split('x')[2] / guiOptions.Resolution.split('x')[1]);
911
+ new_camera.position.set(0, 0, 0);
912
+ new_camera.quaternion.set(0, 0, 0, 1);
913
+ new_camera.updateProjectionMatrix();
914
+
915
+ const cameraSplat = createCameraSplat(new_camera);
916
+ cameraSplats.push(cameraSplat);
917
+ cameraParams.push({
918
+ position: new_camera.position.clone(),
919
+ quaternion: new_camera.quaternion.clone(),
920
+ fov: new_camera.fov,
921
+ aspect: new_camera.aspect,
922
+ });
923
+ scene.add(cameraSplat);
924
+
925
+ updateStatus('Camera settings fixed. Press Space to record cameras.', cameraParams.length);
926
+ },
927
+ Resolution: `${supportedResolutions[0].frame}x${supportedResolutions[0].height}x${supportedResolutions[0].width}`,
928
+ VisualizeCameraSplats: true,
929
+ VisualizeInterpolatedCameras: true,
930
+ inputImagePrompt: () => {
931
+ const fileInput = document.querySelector("#file-input");
932
+ if (fileInput) {
933
+ // Only trigger file selection; the global handler does the cropping and preview update
934
+ fileInput.click();
935
+ }
936
+ },
937
+ imageIndex: 0,
938
+ inputTextPrompt: "",
939
+
940
+ // Camera trajectory templates
941
+ trajectoryMode: "Manual",
942
+ templateType: "Move Forward",
943
+ cameraTrajectory: "Manual",
944
+ trajectorySettings: {
945
+ angle: 180, // angle in degrees (180, 360)
946
+ tilt: 15 // tilt in degrees (15, 30, 45)
947
+ },
948
+ generateTrajectory: () => {
949
+ generateCameraTrajectory(guiOptions.templateType);
950
+ },
951
+ saveTrajectoryToJson: () => {
952
+ if (cameraParams.length === 0) {
953
+ updateStatus('No cameras to save.', cameraParams.length);
954
+ console.warn('No cameras to save');
955
+ return;
956
+ }
957
+
958
+ // Build JSON payload compatible with loader
959
+ const [nStr, hStr, wStr] = guiOptions.Resolution.split('x');
960
+ const n = parseInt(nStr), h = parseInt(hStr), w = parseInt(wStr);
961
+ const payload = {
962
+ // image_prompt: null,
963
+ // text_prompt: guiOptions.inputTextPrompt || "",
964
+ // image_index: guiOptions.imageIndex || 0,
965
+ // resolution: [n, h, w],
966
+ cameras: cameraParams.map(cam => ({
967
+ position: [cam.position.x, cam.position.y, cam.position.z],
968
+ quaternion: [cam.quaternion.w, cam.quaternion.x, cam.quaternion.y, cam.quaternion.z]
969
+ }))
970
+ };
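+ // The saved file therefore looks like (quaternions stored w-first, matching the loader):
+ // { "cameras": [ { "position": [0, 0, 0], "quaternion": [1, 0, 0, 0] }, ... ] }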
971
+
972
+ const blob = new Blob([JSON.stringify(payload, null, 2)], { type: 'application/json' });
973
+ const url = URL.createObjectURL(blob);
974
+ const a = document.createElement('a');
975
+ a.href = url;
976
+ a.download = `trajectory_${Date.now()}.json`;
977
+ document.body.appendChild(a);
978
+ a.click();
979
+ document.body.removeChild(a);
980
+ URL.revokeObjectURL(url);
981
+ updateStatus('Trajectory saved to JSON.', cameraParams.length);
982
+ },
983
+ clearAllCameras: () => {
984
+ if (cameraParams.length <= 1) {
985
+ updateStatus('No cameras to clear (first camera is always preserved)', cameraParams.length);
986
+ return;
987
+ }
988
+
989
+ // Keep the first camera, remove all others
990
+ const firstCamera = cameraParams[0];
991
+ const firstSplat = cameraSplats[0];
992
+
993
+ // Remove all camera splats except the first one
994
+ for (let i = cameraSplats.length - 1; i >= 1; i--) {
995
+ scene.remove(cameraSplats[i]);
996
+ }
997
+
998
+ // Keep only the first camera in arrays
999
+ cameraSplats.length = 1;
1000
+ cameraParams.length = 1;
1001
+
1002
+ // Clear all interpolated camera splats from scene
1003
+ interpolatedCamerasSplats.forEach(splat => scene.remove(splat));
1004
+ interpolatedCamerasSplats.length = 0;
1005
+
1006
+ updateStatus('Cameras cleared (first camera preserved). Ready to add more cameras.', 1);
1007
+ console.log('Cameras cleared, first camera preserved');
1008
+ },
1009
+ // Playback scrub value (0..1)
1010
+ playbackT: 0,
1011
+
1012
+ generate: () => {
1013
+ // Check that enough cameras have been recorded
1014
+ if (cameraParams.length < 2) {
1015
+ console.error('Need at least 2 cameras to generate. Please press Space to record more cameras.');
1016
+ updateStatus('Error: Need at least 2 cameras', cameraParams.length);
1017
+ return;
1018
+ }
1019
+
1020
+ updateStatus('Preparing generation...', cameraParams.length);
1021
+
1022
+ // Remove the previously generated scene
1023
+ if (currentGeneratedSplat) {
1024
+ scene.remove(currentGeneratedSplat);
1025
+ currentGeneratedSplat = null;
1026
+ console.log('Previous generated scene removed');
1027
+ }
1028
+
1029
+ // Reset the progress bar info
1030
+ const generationTimeElement = document.getElementById('generation-time');
1031
+ const fileSizeElement = document.getElementById('file-size');
1032
+ const progressBar = document.getElementById('progress-bar');
1033
+ const progressText = document.getElementById('progress-text');
1034
+
1035
+ if (generationTimeElement) generationTimeElement.textContent = '-';
1036
+ if (fileSizeElement) fileSizeElement.textContent = '-';
1037
+ if (progressBar) progressBar.style.width = '0%';
1038
+ if (progressText) progressText.textContent = '0%';
1039
+
1040
+ // Hide the generation info and download progress
1041
+ const generationInfo = document.getElementById('generation-info');
1042
+ const downloadProgress = document.getElementById('download-progress');
1043
+ if (generationInfo) generationInfo.style.display = 'none';
1044
+ if (downloadProgress) downloadProgress.style.display = 'none';
1045
+
1046
+ showLoading(true);
1047
+
1048
+ // Generate interpolated cameras and visualize them
1049
+ const interpolatedCameras = interpolateCameras(cameraParams, parseInt(guiOptions.Resolution.split('x')[0]));
1050
+ interpolatedCameras.forEach(cam => {
1051
+ const interpolatedCameraSplat = createCameraSplat(cam, [0.5, 0.5, 0.5]);
1052
+ interpolatedCamerasSplats.push(interpolatedCameraSplat);
1053
+ scene.add(interpolatedCameraSplat);
1054
+ });
1055
+
1056
+ console.log('Sending request to backend...');
1057
+ console.log('Interpolated cameras:', interpolatedCameras.length);
1058
+ updateStatus('Sending request to backend...', cameraParams.length);
1059
+
1060
+ // Build the request for the backend
1061
+ let requestUrl, requestBody;
1062
+
1063
+ // Flask backend: POST directly to /generate
1065
+ requestUrl = guiOptions.BackendAddress + '/generate';
1066
+ requestBody = JSON.stringify({
1067
+ image_prompt: inputImageBase64 ? inputImageBase64 : "",
1068
+ text_prompt: guiOptions.inputTextPrompt,
1069
+ image_index: 0,
1070
+ resolution: [
1071
+ parseInt(guiOptions.Resolution.split('x')[0]),
1072
+ parseInt(guiOptions.Resolution.split('x')[1]),
1073
+ parseInt(guiOptions.Resolution.split('x')[2])
1074
+ ],
1075
+ cameras: interpolatedCameras.map(cam => ({
1076
+ position: [cam.position.x, cam.position.y, cam.position.z],
1077
+ quaternion: [cam.quaternion.w, cam.quaternion.x, cam.quaternion.y, cam.quaternion.z],
1078
+ fx: 0.5 / Math.tan(0.5 * cam.fov * Math.PI / 180) * parseInt(guiOptions.Resolution.split('x')[1]),
1079
+ fy: 0.5 / Math.tan(0.5 * cam.fov * Math.PI / 180) * parseInt(guiOptions.Resolution.split('x')[1]),
1080
+ cx: inputImageBase64 && inputImageResolution
1081
+ ? 0.5 * inputImageResolution.width
1082
+ : 0.5 * parseInt(guiOptions.Resolution.split('x')[2]),
1083
+ cy: inputImageBase64 && inputImageResolution
1084
+ ? 0.5 * inputImageResolution.height
1085
+ : 0.5 * parseInt(guiOptions.Resolution.split('x')[1]),
1086
+ }))
1087
+ });
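+ // fx/fy above follow the pinhole model, f = 0.5 * H / tan(FOV / 2), expressed in
+ // pixels of the target height H; cx/cy default to the image center unless an
+ // input image supplies its own resolution.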
1088
1091
+
1092
+ // Ask the backend to generate (async: returns a task_id, then we poll the queue)
1093
+ fetch(requestUrl, {
1094
+ method: 'POST',
1095
+ headers: { 'Content-Type': 'application/json' },
1096
+ mode: 'cors',
1097
+ body: requestBody
1098
+ })
1099
+ .then(response => {
1100
+ const contentType = response.headers.get('content-type');
1101
+ if (contentType && contentType.includes('application/json')) {
1102
+ return response.json();
1103
+ } else {
1104
+ return response.blob().then(blob => {
1105
+ const url = URL.createObjectURL(blob);
1106
+ return { url };
1107
+ });
1108
+ }
1109
+ })
1110
+ .then(data => {
1111
+ console.log(data);
1112
+ // Async queue protocol: the backend returns task_id + queue info (HTTP 202)
1114
+ if (data && data.success && data.task_id) {
1115
+ updateStatus('Queued request submitted. Waiting in queue...', cameraParams.length);
1116
+ showQueueWaiting(data.queue?.position || 0, data.queue?.running_count || 0, data.queue?.queued_count || 0);
1117
+ // Poll until the task completes, then download
1118
+ return pollTaskUntilReady(data.task_id).then(() => ({ url: null }));
1119
+ }
1120
+ // Backward compatible with the old direct-file response format
1121
+ if (data && data.url) {
1122
+ updateStatus('Loading generated scene...', cameraParams.length);
1123
+ return Promise.resolve(data);
1124
+ }
1125
+ throw new Error('Invalid Flask response (expected task_id)');
1126
1127
+ })
1128
+ .then(data => {
1129
+ if (data.url) {
1130
+ updateStatus('Loading 3D scene...', cameraParams.length);
1131
+ // Remove the instruction splat when generation is complete
1132
+ if (instructionSplat) {
1133
+ scene.remove(instructionSplat);
1134
+ console.log('Instruction splat removed');
1135
+ }
1136
+ const GeneratedSplat = new SplatMesh({ url: data.url });
1137
+ scene.add(GeneratedSplat);
1138
+ currentGeneratedSplat = GeneratedSplat; // keep a reference to the newly generated scene
1139
+ console.log('3D scene loaded successfully!');
1140
+ updateStatus('Scene generated successfully!', cameraParams.length);
1141
+ hideDownloadProgress();
1142
+ showLoading(false);
1143
+ }
1144
+ })
1145
+ .catch(error => {
1146
+ console.error('Error:', error);
1147
+ updateStatus('Generation failed: ' + error.message, cameraParams.length);
1148
+ hideDownloadProgress();
1149
+ showLoading(false);
1150
+ });
1151
+ }
1152
+ };
1153
+
1154
+ // Initialize renderer and GUI when DOM is ready
1155
+ function initializeApp() {
1156
+ try {
1157
+ // Debug layout
1158
+ console.log('Initializing app...');
1159
+ console.log('Center panel:', document.querySelector('.center-panel'));
1160
+ console.log('GUI container:', document.getElementById('gui-container'));
1161
+ console.log('Right panel:', document.querySelector('.right-panel'));
1162
+
1163
+ initializeRenderer();
1164
+ initializeGUI();
1165
+ console.log('App initialization complete');
1166
+ } catch (error) {
1167
+ console.error('App initialization failed:', error);
1168
+ }
1169
+ }
1170
+
1171
+ if (document.readyState === 'loading') {
1172
+ document.addEventListener('DOMContentLoaded', initializeApp);
1173
+ } else {
1174
+ initializeApp();
1175
+ }
1176
+
1177
+ // =========================
1178
+ // Utility & Core Functions
1179
+ // =========================
1180
+
1181
+ // Interpolate between two cameras
1182
+ function interpolateTwoCameras(startCamera, endCamera, _t) {
1183
+ const interpolatedCamera = new THREE.PerspectiveCamera(startCamera.fov, startCamera.aspect);
1184
+
1185
+ // If _t is near 0, use startCamera directly
1186
+ if (_t < 1e-6) {
1187
+ interpolatedCamera.position.copy(startCamera.position);
1188
+ interpolatedCamera.quaternion.copy(startCamera.quaternion);
1189
+ }
1190
+ // If _t is near 1, use endCamera directly
1191
+ else if (_t > 1 - 1e-6) {
1192
+ interpolatedCamera.position.copy(endCamera.position);
1193
+ interpolatedCamera.quaternion.copy(endCamera.quaternion);
1194
+ }
1195
+ // Otherwise interpolate
1196
+ else {
1197
+ interpolatedCamera.position.copy(startCamera.position).lerp(endCamera.position, _t);
1198
+ interpolatedCamera.quaternion.copy(startCamera.quaternion).slerp(endCamera.quaternion, _t);
1199
+ }
1200
+
1201
+ return interpolatedCamera;
1202
+ }
1203
+
1204
+ function interpolateCameras(cameras, M) {
1205
+ const interpolatedCameras = [];
1206
+
1207
+ if (cameras.length === 0) {
1208
+ return interpolatedCameras;
1209
+ }
1210
+
1211
+ if (cameras.length === 1) {
1212
+ // With a single camera, just repeat it
1213
+ for (let i = 0; i < M; i++) {
1214
+ interpolatedCameras.push(cameras[0]);
1215
+ }
1216
+ return interpolatedCameras;
1217
+ }
1218
+
1219
+ for (let i = 0; i < M; i++) {
1220
+ const t = i / (M - 1);
1221
+ const startIndex = Math.min(Math.floor(t * (cameras.length - 1)), cameras.length - 2);
1222
+ const endIndex = startIndex + 1;
1223
+ const startCamera = cameras[startIndex];
1224
+ const endCamera = cameras[endIndex];
1225
+ const _t = t * (cameras.length - 1) - startIndex;
1226
+ const interpolatedCamera = interpolateTwoCameras(startCamera, endCamera, _t);
1227
+ interpolatedCameras.push(interpolatedCamera);
1228
+ }
1229
+ return interpolatedCameras;
1230
+ }
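+ // e.g. interpolateCameras(cameraParams, 24) resamples the recorded keyframes into
+ // 24 evenly spaced cameras, lerping positions and slerping rotations between neighbors.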
1231
+
1232
+ // Create a wireframe-cube splat visualization
1233
+ function createCubeSplat(size = 0.1, pointColor = [1, 1, 1]) {
1234
+ const cubeSplat = new SplatMesh({
1235
+ constructSplats: (splats) => {
1236
+ const NUM_SPLATS_PER_EDGE = 1000;
1237
+ const scales = new THREE.Vector3().setScalar(0.002);
1238
+ const quaternion = new THREE.Quaternion();
1239
+ const opacity = 1;
1240
+ const color = new THREE.Color(...pointColor);
1241
+
1242
+ // The cube's 8 vertices
1243
+ const halfSize = size / 2;
1244
+ const vertices = [
1245
+ new THREE.Vector3(-halfSize, -halfSize, -halfSize), // 0: left-bottom-back
+ new THREE.Vector3(halfSize, -halfSize, -halfSize), // 1: right-bottom-back
+ new THREE.Vector3(halfSize, halfSize, -halfSize), // 2: right-top-back
+ new THREE.Vector3(-halfSize, halfSize, -halfSize), // 3: left-top-back
+ new THREE.Vector3(-halfSize, -halfSize, halfSize), // 4: left-bottom-front
+ new THREE.Vector3(halfSize, -halfSize, halfSize), // 5: right-bottom-front
+ new THREE.Vector3(halfSize, halfSize, halfSize), // 6: right-top-front
+ new THREE.Vector3(-halfSize, halfSize, halfSize), // 7: left-top-front
1253
+ ];
1254
+
1255
+ // The cube's 12 edges
1256
+ const edges = [
1257
+ [0, 1], [1, 2], [2, 3], [3, 0], // 4 back edges
+ [4, 5], [5, 6], [6, 7], [7, 4], // 4 front edges
+ [0, 4], [1, 5], [2, 6], [3, 7], // 4 edges connecting front and back
1260
+ ];
1261
+
1262
+ // Generate splat points along each edge
1263
+ for (let i = 0; i < edges.length; i++) {
1264
+ const start = vertices[edges[i][0]];
1265
+ const end = vertices[edges[i][1]];
1266
+ for (let j = 0; j < NUM_SPLATS_PER_EDGE; j++) {
1267
+ const point = new THREE.Vector3().lerpVectors(start, end, j / NUM_SPLATS_PER_EDGE);
1268
+ splats.pushSplat(point, scales, quaternion, opacity, color);
1269
+ }
1270
+ }
1271
+ },
1272
+ });
1273
+ return cubeSplat;
1274
+ }
1275
+
1276
+ // Create a camera-frustum splat visualization
1277
+ function createCameraSplat(camera, pointColor = [1, 1, 1]) {
1278
+ const cameraSplat = new SplatMesh({
1279
+ constructSplats: (splats) => {
1280
+ const NUM_SPLATS_PER_EDGE = 1000;
1281
+ const LENGTH_PER_EDGE = 0.1;
1282
+ const center = new THREE.Vector3();
1283
+ const scales = new THREE.Vector3().setScalar(0.001);
1284
+ const quaternion = new THREE.Quaternion();
1285
+ const opacity = 1;
1286
+ const color = new THREE.Color(...pointColor);
1287
+
1288
+ const H = 1000;
1289
+ const W = 1000 * camera.aspect;
1290
+ const fx = 0.5 * H / Math.tan(0.5 * camera.fov * Math.PI / 180);
1291
+ const fy = 0.5 * H / Math.tan(0.5 * camera.fov * Math.PI / 180);
1292
+
1293
+ const xt = (0 - W / 2 + 0.5) / fy;
1294
+ const xb = (W - W / 2 + 0.5) / fy;
1295
+ const yl = - (0 - H / 2 + 0.5) / fx;
1296
+ const yr = - (H - H / 2 + 0.5) / fx;
1297
+
1298
+ const lt = new THREE.Vector3(xt * LENGTH_PER_EDGE, yl * LENGTH_PER_EDGE, -1 * LENGTH_PER_EDGE);
1299
+ const rt = new THREE.Vector3(xt * LENGTH_PER_EDGE, yr * LENGTH_PER_EDGE, -1 * LENGTH_PER_EDGE);
1300
+ const lb = new THREE.Vector3(xb * LENGTH_PER_EDGE, yl * LENGTH_PER_EDGE, -1 * LENGTH_PER_EDGE);
1301
+ const rb = new THREE.Vector3(xb * LENGTH_PER_EDGE, yr * LENGTH_PER_EDGE, -1 * LENGTH_PER_EDGE);
1302
+
1303
+ const lines = [
1304
+ [center, lt], [center, rt], [center, lb], [center, rb],
1305
+ [lt, rt], [lt, lb], [rt, rb], [lb, rb],
1306
+ ];
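+ // Four rays from the camera center to the image-plane corners, plus the four
+ // edges of the image-plane rectangle, each drawn as a string of tiny splats.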
1307
+
1308
+ for (let i = 0; i < lines.length; i++) {
1309
+ for (let j = 0; j < NUM_SPLATS_PER_EDGE; j++) {
1310
+ const point = new THREE.Vector3().lerpVectors(lines[i][0], lines[i][1], j / NUM_SPLATS_PER_EDGE);
1311
+ splats.pushSplat(point, scales, quaternion, opacity, color);
1312
+ }
1313
+ }
1314
+ },
1315
+ });
1316
+ cameraSplat.quaternion.copy(camera.quaternion);
1317
+ cameraSplat.position.copy(camera.position);
1318
+ return cameraSplat;
1319
+ }
1320
+
1321
+ // Generate a camera trajectory from a template
1322
+ function generateCameraTrajectory(trajectoryType) {
1323
+ if (trajectoryType === "Manual") {
1324
+ updateStatus('Manual mode: Use Space to record cameras manually', cameraParams.length);
1325
+ return;
1326
+ }
1327
+
1328
+ // Check that the FOV has been fixed
1329
+ if (!fixGenerationFOV) {
1330
+ updateStatus('Error: Please fix FOV first before generating trajectory', cameraParams.length);
1331
+ return;
1332
+ }
1333
+
1334
+ // Use the last camera as the reference point
1335
+ let referenceCamera;
1336
+ if (cameraParams.length > 0) {
1337
+ // Use the last recorded camera as the reference
1338
+ const lastCamera = cameraParams[cameraParams.length - 1];
1339
+ referenceCamera = new THREE.PerspectiveCamera(guiOptions.FOV, camera.aspect);
1340
+ referenceCamera.position.copy(lastCamera.position);
1341
+ referenceCamera.quaternion.copy(lastCamera.quaternion);
1342
+ referenceCamera.updateProjectionMatrix();
1343
+ } else {
1344
+ // No recorded cameras: start from the origin
1345
+ referenceCamera = new THREE.PerspectiveCamera(guiOptions.FOV, camera.aspect);
1346
+ referenceCamera.position.set(0, 0, 0);
1347
+ referenceCamera.quaternion.set(0, 0, 0, 1);
1348
+ referenceCamera.updateProjectionMatrix();
1349
+ }
1350
+
1351
+ // For orbit trajectories, compute the target point the cameras orbit around.
+ // Always derive the target from the current reference camera (the last one).
1353
+ let orbitTarget = null;
1354
+ let orbitStartCamera = null;
1355
+ if (trajectoryType.includes("Orbit") && cameraParams.length > 0) {
1356
+ // Use the last camera as reference; the target is 1 unit in front of it
1357
+ orbitStartCamera = cameraParams[cameraParams.length - 1];
1358
+ orbitTarget = orbitStartCamera.position.clone().add(
1359
+ new THREE.Vector3(0, 0, -1).applyQuaternion(orbitStartCamera.quaternion)
1360
+ );
1361
+ console.log("Orbit target calculated from last camera:", orbitStartCamera.position, "->", orbitTarget);
1362
+ } else if (trajectoryType.includes("Orbit")) {
1363
+ // No recorded cameras: use the current camera as the reference
1364
+ orbitStartCamera = referenceCamera;
1365
+ orbitTarget = referenceCamera.position.clone().add(
1366
+ new THREE.Vector3(0, 0, -1).applyQuaternion(referenceCamera.quaternion)
1367
+ );
1368
+ console.log("Orbit target calculated from current camera:", referenceCamera.position, "->", orbitTarget);
1369
+ }
1370
+
1371
+ const cameras = [];
1372
+ const stepSize = 0.5; // translation step size
1373
+ const totalOrbitAngle = 15 * Math.PI / 180; // total orbit angle of 15 degrees
1374
+
1375
+ // Generate cameras according to the trajectory type
1376
+ let numCameras = 1; // generate 1 camera by default
1377
+ if (trajectoryType.includes("Orbit")) {
1378
+ numCameras = 1; // orbit motion also generates 1 camera
1379
+ console.log(`Generating ${numCameras} orbit camera with total angle ${totalOrbitAngle * 180 / Math.PI}°`);
1380
+ }
1381
+
1382
+ for (let i = 1; i <= numCameras; i++) {
1383
+ const newCamera = new THREE.PerspectiveCamera(guiOptions.FOV, camera.aspect);
1384
+ let position, quaternion;
1385
+
1386
+ switch (trajectoryType) {
1387
+ case "Move Forward":
1388
+ position = referenceCamera.position.clone();
1389
+ position.z -= stepSize;
1390
+ quaternion = referenceCamera.quaternion.clone();
1391
+ break;
1392
+
1393
+ case "Move Backward":
1394
+ position = referenceCamera.position.clone();
1395
+ position.z += stepSize;
1396
+ quaternion = referenceCamera.quaternion.clone();
1397
+ break;
1398
+
1399
+ case "Move Left":
1400
+ position = referenceCamera.position.clone();
1401
+ position.x -= stepSize;
1402
+ quaternion = referenceCamera.quaternion.clone();
1403
+ break;
1404
+
1405
+ case "Move Right":
1406
+ position = referenceCamera.position.clone();
1407
+ position.x += stepSize;
1408
+ quaternion = referenceCamera.quaternion.clone();
1409
+ break;
1410
+
1411
+ case "Orbit Left 15°":
1412
+ const radius = 1.0;
1413
+ // Left orbit: -15 degrees
1414
+ const angle = -totalOrbitAngle;
1415
+
1416
+ console.log(`Camera ${i}: angle=${angle * 180 / Math.PI}° (Left)`);
1417
+
1418
+ // Orbit position in the reference camera's local frame
1419
+ const localOrbitPos = new THREE.Vector3(
1420
+ Math.sin(angle) * radius,
1421
+ 0,
1422
+ Math.cos(angle) * radius
1423
+ );
1424
+
1425
+ // Transform to world space: rotate into the reference camera's orientation
1426
+ const worldOrbitPos = localOrbitPos.applyQuaternion(orbitStartCamera.quaternion);
1427
+
1428
+ // Final position: the target point plus the world-space offset
1429
+ position = orbitTarget.clone().add(worldOrbitPos);
1430
+
1431
+ console.log(`Orbit Left camera ${i}: localPos=`, localOrbitPos, 'worldPos=', worldOrbitPos, 'finalPos=', position);
1432
+
1433
+ // Orientation: every camera looks at the orbit center (the target)
1434
+ const lookDirection = orbitTarget.clone().sub(position).normalize();
1435
+ quaternion = new THREE.Quaternion().setFromUnitVectors(
1436
+ new THREE.Vector3(0, 0, -1),
1437
+ lookDirection
1438
+ );
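+ // setFromUnitVectors yields the rotation taking the camera's default forward
+ // (0, 0, -1) onto lookDirection; roll about the view axis is left unconstrained.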
1439
+
1440
+ console.log(`Orbit Left camera ${i}: quaternion=`, quaternion);
1441
+ break;
1442
+
1443
+ case "Orbit Right 15°":
1444
+ const radiusRight = 1.0;
1445
+ // Right orbit: +15 degrees
1446
+ const angleRight = totalOrbitAngle;
1447
+
1448
+ console.log(`Camera ${i}: angle=${angleRight * 180 / Math.PI}° (Right)`);
1449
+
1450
+ // Orbit position in the reference camera's local frame
1451
+ const localOrbitPosRight = new THREE.Vector3(
1452
+ Math.sin(angleRight) * radiusRight,
1453
+ 0,
1454
+ Math.cos(angleRight) * radiusRight
1455
+ );
1456
+
1457
+ // Transform to world space: rotate into the reference camera's orientation
1458
+ const worldOrbitPosRight = localOrbitPosRight.applyQuaternion(orbitStartCamera.quaternion);
1459
+
1460
+ // Final position: the target point plus the world-space offset
1461
+ position = orbitTarget.clone().add(worldOrbitPosRight);
1462
+
1463
+ console.log(`Orbit Right camera ${i}: localPos=`, localOrbitPosRight, 'worldPos=', worldOrbitPosRight, 'finalPos=', position);
1464
+
1465
+ // Orientation: every camera looks at the orbit center (the target)
1466
+ const lookDirectionRight = orbitTarget.clone().sub(position).normalize();
1467
+ quaternion = new THREE.Quaternion().setFromUnitVectors(
1468
+ new THREE.Vector3(0, 0, -1),
1469
+ lookDirectionRight
1470
+ );
1471
+
1472
+ console.log(`Orbit Right camera ${i}: quaternion=`, quaternion);
1473
+ break;
1474
+
1475
+
1476
+ default:
1477
+ position = referenceCamera.position.clone();
1478
+ quaternion = referenceCamera.quaternion.clone();
1479
+ }
1480
+
1481
+ newCamera.position.copy(position);
1482
+ newCamera.quaternion.copy(quaternion);
1483
+ newCamera.updateProjectionMatrix();
1484
+ cameras.push(newCamera);
1485
+ }
1486
+
1487
+ // Add the cameras to the scene
1488
+ cameras.forEach(cam => {
1489
+ const cameraSplat = createCameraSplat(cam);
1490
+ cameraSplats.push(cameraSplat);
1491
+ cameraParams.push({
1492
+ position: cam.position.clone(),
1493
+ quaternion: cam.quaternion.clone(),
1494
+ fov: cam.fov,
1495
+ aspect: cam.aspect,
1496
+ });
1497
+ scene.add(cameraSplat);
1498
+ });
1499
+
1500
+ updateStatus(`Added ${cameras.length} cameras using ${trajectoryType} trajectory`, cameraParams.length);
1501
+ console.log(`Added ${cameras.length} cameras using ${trajectoryType} trajectory`);
1502
+ }
1503
+
1504
+ // =========================
1505
+ // GUI & User Interaction
1506
+ // =========================
1507
+
1508
+ // GUI controls - initialized lazily
1509
+ function initializeGUI() {
1510
+ const guiContainer = document.getElementById('gui-container');
1511
+ if (guiContainer && !gui) {
1512
+ // Clear any existing content
1513
+ guiContainer.innerHTML = '';
1514
+
1515
+ gui = new GUI({ title: "FlashWorld Controls", container: guiContainer });
1516
+ console.log('GUI initialized in container:', guiContainer);
1517
+
1518
+ // Step 1: Configure Generation Settings
1519
+ const step1Folder = gui.addFolder('1. Configure Settings');
1520
+ step1Folder.add(guiOptions, "BackendAddress").name("Backend Address");
1521
+
1522
+ // FOV and Resolution controllers, enabled initially
1523
+ const fovController = step1Folder.add(guiOptions, "FOV", 0, 120, 1).name("FOV").onChange((value) => {
1524
+ camera.fov = value;
1525
+ camera.updateProjectionMatrix();
1526
+ });
1527
+ const resolutionController = step1Folder.add(guiOptions, "Resolution", supportedResolutions.map(
1528
+ r => `${r.frame}x${r.height}x${r.width}`
1529
+ )).name("Resolution (NxHxW)").onChange((value) => {
1530
+ updateCanvasSize();
1531
+ });
1532
+
1533
+ // The Fix Configuration button goes at the bottom
1534
+ const fixGenerationFOVController = step1Folder.add(guiOptions, "fixGenerationFOV").name("Fix Configuration");
1535
+ step1Folder.open();
1536
+
1537
+ // Step 2: Set Up Camera Path
1538
+ const step2Folder = gui.addFolder('2. Set Up Camera Path');
1539
+
1540
+ // Camera trajectory templates
1541
+ const trajectoryFolder = step2Folder.addFolder('Camera Trajectory');
1542
+
1543
+ // Trajectory mode selection
1544
+ const trajectoryModeController = trajectoryFolder.add(guiOptions, "trajectoryMode", [
1545
+ "Manual",
1546
+ "Template",
1547
+ "JSON"
1548
+ ]).name("Trajectory Mode");
1549
+
1550
+ // Template type selection (only usable in Template mode)
1551
+ const templateTypeController = trajectoryFolder.add(guiOptions, "templateType", [
1552
+ "Move Forward",
1553
+ "Move Backward",
1554
+ "Move Left",
1555
+ "Move Right",
1556
+ "Orbit Left 15°",
1557
+ "Orbit Right 15°"
1558
+ ]).name("Template Type");
1559
+
1560
+ // Generate-trajectory button
1561
+ const generateTrajectoryController = trajectoryFolder.add(guiOptions, "generateTrajectory").name("Generate Trajectory");
1562
+
1563
+ // Load/save JSON trajectory buttons
1564
+ const loadTrajectoryController = trajectoryFolder.add(guiOptions, "LoadTrajectoryFromJson").name("Load from JSON");
1565
+ const saveTrajectoryController = trajectoryFolder.add(guiOptions, "saveTrajectoryToJson").name("Save Trajectory");
1566
+
1567
+ // Clear-cameras button
1568
+ const clearAllCamerasController = trajectoryFolder.add(guiOptions, "clearAllCameras").name("Clear All Cameras");
1569
+
1570
+ // Initial state: disable all trajectory-related controls
1571
+ templateTypeController.disable();
1572
+ generateTrajectoryController.disable();
1573
+ loadTrajectoryController.disable();
1574
+
1575
+ // Handle trajectory mode changes
1576
+ trajectoryModeController.onChange((value) => {
1577
+ if (value === "Manual") {
1578
+ templateTypeController.disable();
1579
+ generateTrajectoryController.disable();
1580
+ loadTrajectoryController.disable();
1581
+ } else if (value === "Template") {
1582
+ templateTypeController.enable();
1583
+ if (fixGenerationFOV) {
1584
+ generateTrajectoryController.enable();
1585
+ } else {
1586
+ generateTrajectoryController.disable();
1587
+ }
1588
+ loadTrajectoryController.disable();
1589
+ } else if (value === "JSON") {
1590
+ templateTypeController.disable();
1591
+ generateTrajectoryController.disable();
1592
+ if (fixGenerationFOV) {
1593
+ loadTrajectoryController.enable();
1594
+ } else {
1595
+ loadTrajectoryController.disable();
1596
+ }
1597
+ }
1598
+ });
1599
+
1600
+ // Enable trajectory generation once the configuration is fixed
1601
+ const originalFixFOV = guiOptions.fixGenerationFOV;
1602
+ guiOptions.fixGenerationFOV = () => {
1603
+ originalFixFOV();
1604
+
1605
+ // After Fix Configuration, disable all Step 1 controllers
1606
+ fovController.disable();
1607
+ resolutionController.disable();
1608
+
1609
+ // Enable the controls matching the current trajectory mode
1610
+ if (guiOptions.trajectoryMode === "Template") {
1611
+ generateTrajectoryController.enable();
1612
+ } else if (guiOptions.trajectoryMode === "JSON") {
1613
+ loadTrajectoryController.enable();
1614
+ }
1615
+ updateStatus('Configuration fixed. You can now generate camera trajectory.', cameraParams.length);
1616
+ };
1617
+
1618
+ trajectoryFolder.open();
1619
+
1620
+ step2Folder.add(guiOptions, "VisualizeCameraSplats").name("Visualize Cameras").onChange((value) => {
1621
+ cameraSplats.forEach(cameraSplat => {
1622
+ cameraSplat.opacity = value ? 1 : 0;
1623
+ });
1624
+ });
1625
+ step2Folder.add(guiOptions, "VisualizeInterpolatedCameras").name("Visualize Interpolated Cameras").onChange((value) => {
1626
+ interpolatedCamerasSplats.forEach(interpolatedCameraSplat => {
1627
+ interpolatedCameraSplat.opacity = value ? 1 : 0;
1628
+ });
1629
+ });
1630
+
1631
+ // Store controllers globally so they can be accessed from guiOptions
1632
+ window.fixGenerationFOVController = fixGenerationFOVController;
1633
+
1634
+ // Step 3: Add Scene Prompts
1635
+ const step3Folder = gui.addFolder('3. Add Scene Prompts');
1636
+ step3Folder.add(guiOptions, "inputImagePrompt").name("Input Image Prompt");
1637
+ step3Folder.add(guiOptions, "inputTextPrompt").name("Input Text Prompt");
1638
+ step3Folder.add(guiOptions, "imageIndex", 0, 24, 1).name("Image Index");
1639
+
1640
+
1641
+ // Step 4: Generate Your Scene
1642
+ const step4Folder = gui.addFolder('4. Generate Scene');
1643
+ step4Folder.add(guiOptions, "generate").name("Generate!");
1644
+ step4Folder.open();
1645
+
1646
+ // Step 5: Trajectory Playback (Scrubber)
1647
+ const step5Folder = gui.addFolder('5. Trajectory Playback');
1648
+ step5Folder.add(guiOptions, 'playbackT', 0, 1, 0.001).name('Scrub (0-1)').onChange((value) => {
1649
+ // On the first scrub, record the user's camera state so it can optionally be restored
1650
+ if (!userCameraState) {
1651
+ userCameraState = {
1652
+ position: camera.position.clone(),
1653
+ quaternion: camera.quaternion.clone(),
1654
+ fov: camera.fov
1655
+ };
1656
+ }
1657
+ setCameraByScrub(value);
1658
+ updateStatus(`Scrubbing trajectory: t=${value.toFixed(3)}`, cameraParams.length);
1659
+ });
1660
+ step5Folder.open();
1661
+
1662
+ }
1663
+ }
1664
+
1665
+
1666
+ // =========================
1667
+ // File Input (Image Prompt)
1668
+ // =========================
1669
+ const fileInput = document.querySelector("#file-input");
1670
+ fileInput.onchange = (event) => {
1671
+ const files = event.target.files;
1672
+ if (!files || files.length === 0) return;
1673
+ Array.from(files).forEach(file => {
1674
+ const reader = new FileReader();
1675
+ reader.onload = function(e) {
1676
+ console.log("Loaded image:", file.name, e.target.result);
1677
+
1678
+ // Get the current Resolution
1679
+ let resolutionStr = guiOptions.Resolution;
1680
+ let [n, h, w] = resolutionStr.split('x').map(Number);
1681
+
1682
+ // Load the image
1683
+ const img = new Image();
1684
+ img.onload = function() {
1685
+ window.inputImageResolution = { width: img.width, height: img.height };
1686
+ console.log("Input image resolution:", window.inputImageResolution);
1687
+
1688
+ // Compute the center-crop parameters
1689
+ let scaleH = h / img.height;
1690
+ let scaleW = w / img.width;
1691
+ let scale = Math.max(scaleH, scaleW);
1692
+ let newW = Math.round(w / scale);
1693
+ let newH = Math.round(h / scale);
1694
+ let sx = Math.floor((img.width - newW) / 2);
1695
+ let sy = Math.floor((img.height - newH) / 2);
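+ // Worked example: a 1920x1080 image targeting w=704, h=480 gives
+ // scale = max(480/1080, 704/1920) ≈ 0.444, so a centered 1584x1080 region
+ // (sx = 168, sy = 0) is cropped, then scaled down to 704x480.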
1696
+
1697
+ // Create a canvas for the center crop and resize
1698
+ const canvas = document.createElement('canvas');
1699
+ canvas.width = w;
1700
+ canvas.height = h;
1701
+ const ctx = canvas.getContext('2d');
1702
+ ctx.drawImage(
1703
+ img,
1704
+ sx, sy, newW, newH, // source crop
1705
+ 0, 0, w, h // destination size
1706
+ );
1707
+ // Base64 of the cropped and resized image (sent to the backend)
1708
+ inputImageBase64 = canvas.toDataURL('image/png');
1709
+ // Update the preview with the cropped image
1710
+ const previewArea = document.getElementById('image-preview-area');
1711
+ const previewImg = document.getElementById('preview-img');
1712
+ if (previewImg && previewArea) {
1713
+ previewImg.src = inputImageBase64;
1714
+ previewArea.style.display = 'block';
1715
+ }
1716
+ // Record the resolution sent to the backend (now aligned to the current Resolution)
1717
+ window.inputImageResolution = { width: w, height: h };
1718
+ console.log("Cropped and resized image to:", w, h);
1719
+ };
1720
+ img.src = e.target.result;
1721
+ };
1722
+ reader.readAsDataURL(file);
1723
+ });
1724
+
1725
+ };
1726
+
1727
+ // =========================
1728
+ // File Input (JSON)
1729
+ // =========================
1730
1846
+
1847
+ const jsonInput = document.querySelector("#json-input");
1848
+ jsonInput.onchange = (event) => {
1849
+ const files = event.target.files;
1850
+ if (!files || files.length === 0) return;
1851
+ const file = files[0];
1852
+ const reader = new FileReader();
1853
+ reader.onload = function(e) {
1854
+ let jsonData;
1855
+ try {
1856
+ jsonData = JSON.parse(e.target.result);
1857
+ } catch (error) {
1858
+ console.error("JSON parsing error:", error);
1859
+ return;
1860
+ }
1861
+
1862
+ // Check whether we are loading the trajectory only
1863
+ const loadTrajectoryOnly = window.loadTrajectoryOnly;
1864
+ window.loadTrajectoryOnly = false; // reset the flag
1865
+
1866
+ // Both trajectory-only and full-JSON loads start from a clean slate:
+ // remove all existing cameras and interpolated cameras
+ cameraSplats.forEach(splat => scene.remove(splat));
+ cameraSplats.length = 0;
+ cameraParams.length = 0;
+ interpolatedCamerasSplats.forEach(splat => scene.remove(splat));
+ interpolatedCamerasSplats.length = 0;
1881
+
1882
+ try {
1883
+ // 兼容不同命名的字段
1884
+ const imagePrompt = jsonData.image_prompt || jsonData.imagePrompt || null;
1885
+ const textPrompt = jsonData.text_prompt || jsonData.textPrompt || "";
1886
+ const cameras = jsonData.cameras || [];
1887
+ const resolution = jsonData.resolution || [16, 480, 640];
1888
+ const imageIndex = jsonData.image_index || jsonData.imageIndex || 0;
1889
+
1890
+ console.log("Loaded JSON data:", {
1891
+ imagePrompt,
1892
+ textPrompt,
1893
+ cameras: cameras.length,
1894
+ resolution,
1895
+ imageIndex
1896
+ });
1897
+
1898
+ // Handle the image prompt (only when not loading trajectory only)
1899
+ if (!loadTrajectoryOnly && imagePrompt) {
1900
+ inputImageBase64 = imagePrompt;
1901
+ console.log("Image prompt loaded");
1902
+ }
1903
+
1904
+ // Set the text prompt (only when not loading trajectory only)
1905
+ if (!loadTrajectoryOnly) {
1906
+ guiOptions.inputTextPrompt = textPrompt;
1907
+ guiOptions.imageIndex = imageIndex;
1908
+ }
1909
+
1910
+ // Process the camera data
1911
+ if (cameras && cameras.length > 0) {
1912
+ let jsonFirstCamera = null;
1913
+ let jsonFirstPosition = null;
1914
+ let jsonFirstQuaternion = null;
1915
+
1916
+ // First read the position and quaternion of the first camera in the JSON
1917
+ if (loadTrajectoryOnly && cameras.length > 0) {
1918
+ const firstCameraData = cameras[0];
1919
+ if (Array.isArray(firstCameraData.position) && firstCameraData.position.length === 3) {
1920
+ jsonFirstPosition = new THREE.Vector3(
1921
+ firstCameraData.position[0],
1922
+ firstCameraData.position[1],
1923
+ firstCameraData.position[2]
1924
+ );
1925
+ }
1926
+ if (Array.isArray(firstCameraData.quaternion) && firstCameraData.quaternion.length === 4) {
1927
+ jsonFirstQuaternion = new THREE.Quaternion(
1928
+ firstCameraData.quaternion[1],
1929
+ firstCameraData.quaternion[2],
1930
+ firstCameraData.quaternion[3],
1931
+ firstCameraData.quaternion[0]
1932
+ );
1933
+ }
1934
+ }
1935
+
1936
+ cameras.forEach((cameraData, index) => {
1937
+ // 解析分辨率
1938
+ let aspect = 1.0;
1939
+ if (Array.isArray(resolution) && resolution.length === 3) {
1940
+ aspect = resolution[2] / resolution[1];
1941
+ } else {
1942
+ aspect = guiOptions.Resolution.split('x')[2] / guiOptions.Resolution.split('x')[1];
1943
+ }
1944
+
1945
+ // Decide the FOV based on the load mode
1946
+ let fov = 60;
1947
+ if (loadTrajectoryOnly) {
1948
+ // Trajectory load: use the FOV set in the GUI
1949
+ fov = guiOptions.FOV;
1950
+ } else {
1951
+ // Full JSON load: use the FOV from the JSON or the default
1952
+ if (cameraData.fx && cameraData.fy) {
1953
+ fov = 2 * Math.atan(0.5 / cameraData.fx) * 180 / Math.PI;
1954
+ }
1955
+ }
1956
+
1957
+ const cam = new THREE.PerspectiveCamera(fov, aspect);
1958
+
1959
+ // Set position and quaternion
1960
+ if (Array.isArray(cameraData.position) && cameraData.position.length === 3) {
1961
+ cam.position.set(cameraData.position[0], cameraData.position[1], cameraData.position[2]);
1962
+ }
1963
+
1964
+ if (Array.isArray(cameraData.quaternion) && cameraData.quaternion.length === 4) {
1965
+ // Note: the JSON stores [w, x, y, z]; three.js expects (x, y, z, w)
1966
+ cam.quaternion.set(
1967
+ cameraData.quaternion[1],
1968
+ cameraData.quaternion[2],
1969
+ cameraData.quaternion[3],
1970
+ cameraData.quaternion[0]
1971
+ );
1972
+ }
1973
+
1974
+ // Trajectory load: force the first camera to the origin (kept disabled; the normalization below achieves this)
1975
+ // if (loadTrajectoryOnly && index === 0) {
1976
+ // cam.position.set(0, 0, 0);
1977
+ // cam.quaternion.set(0, 0, 0, 1);
1978
+ // }
1979
+
1980
+ // Trajectory load: normalize poses relative to the fixed-FOV reference camera
1981
+ if (loadTrajectoryOnly && jsonFirstPosition && jsonFirstQuaternion) {
1982
+ // Normalization logic mirrors the Python code
1983
+ // 1. Compute the c2w matrix of the JSON's first camera
1984
+ const jsonFirstC2W = new THREE.Matrix4();
1985
+ jsonFirstC2W.compose(jsonFirstPosition, jsonFirstQuaternion, new THREE.Vector3(1, 1, 1));
1986
+
1987
+ // 2. Compute the current camera's c2w matrix
1988
+ const currentC2W = new THREE.Matrix4();
1989
+ currentC2W.compose(cam.position, cam.quaternion, new THREE.Vector3(1, 1, 1));
1990
+
1991
+ // 3. Compute the relative transform: ref_w2c @ current_c2w
1992
+ const refW2C = jsonFirstC2W.clone().invert();
1993
+ const relativeTransform = refW2C.clone().multiply(currentC2W);
1994
+
1995
+ // 4. Apply the relative transform to an origin camera (as the reference)
1996
+ const fixedC2W = new THREE.Matrix4();
1997
+ fixedC2W.compose(new THREE.Vector3(0, 0, 0), new THREE.Quaternion(0, 0, 0, 1), new THREE.Vector3(1, 1, 1));
1998
+
1999
+ const newTransform = fixedC2W.clone().multiply(relativeTransform);
2000
+
2001
+ // 5. Extract the new position and rotation
2002
+ const newPosition = new THREE.Vector3();
2003
+ const newQuaternion = new THREE.Quaternion();
2004
+ const newScale = new THREE.Vector3();
2005
+ newTransform.decompose(newPosition, newQuaternion, newScale);
2006
+
2007
+ cam.position.copy(newPosition);
2008
+ cam.quaternion.copy(newQuaternion);
2009
+ }
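+ // Net effect: the loaded trajectory is re-expressed relative to its own first
+ // camera, so playback starts at the origin looking down -Z.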
2010
+
2011
+ // Set FOV and focal length (only when not loading trajectory only)
2012
+ if (!loadTrajectoryOnly && cameraData.fx && cameraData.fy) {
2013
+ cam.fov = fov;
2014
+ cam.aspect = cameraData.fx / cameraData.fy;
2015
+ cam.updateProjectionMatrix();
2016
+ } else if (loadTrajectoryOnly) {
2017
+ // Trajectory load: use the GUI's FOV and aspect
2018
+ cam.fov = fov;
2019
+ cam.aspect = aspect;
2020
+ cam.updateProjectionMatrix();
2021
+ }
2022
+
2023
+ const cameraSplat = createCameraSplat(cam);
2024
+ cameraSplats.push(cameraSplat);
2025
+ cameraParams.push({
2026
+ position: cam.position.clone(),
2027
+ quaternion: cam.quaternion.clone(),
2028
+ fov: cam.fov,
2029
+ aspect: cam.aspect,
2030
+ });
2031
+ scene.add(cameraSplat);
2032
+ });
2033
+
2034
+ console.log(cameraParams);
2035
+ }
2036
+
2037
+ // Set the resolution (only when not loading trajectory only)
2038
+ if (!loadTrajectoryOnly && Array.isArray(resolution) && resolution.length === 3) {
2039
+ guiOptions.Resolution = `${resolution[0]}x${resolution[1]}x${resolution[2]}`;
2040
+ }
2041
+
2042
+ // Show a success message
+ if (loadTrajectoryOnly) {
+ updateStatus(`Trajectory loaded: ${cameras.length} cameras`, cameraParams.length);
+ }
2047
+ } catch (error) {
2048
+ console.error("JSON data processing error:", error);
2049
+ }
2050
+ };
2051
+ reader.readAsText(file);
2052
+ };
2053
+
2054
+ // =========================
2055
+ // Keyboard Controls
2056
+ // =========================
2057
+ document.addEventListener('keypress', (event) => {
2058
+ if (event.code === 'Space') {
2059
+ if (!fixGenerationFOV) {
2060
+ updateStatus('Please fix Generation FOV first', cameraParams.length);
2061
+ return;
2062
+ }
2063
+ // Record the current camera pose
2064
+ const new_camera = camera.clone();
2065
+ new_camera.fov = guiOptions.FOV;
2066
+ new_camera.aspect = guiOptions.Resolution.split('x')[2] / guiOptions.Resolution.split('x')[1];
2067
+ new_camera.updateProjectionMatrix();
2068
+
2069
+ const cameraSplat = createCameraSplat(new_camera);
2070
+ cameraSplats.push(cameraSplat);
2071
+ cameraParams.push({
2072
+ position: new_camera.position.clone(),
2073
+ quaternion: new_camera.quaternion.clone(),
2074
+ fov: new_camera.fov,
2075
+ aspect: new_camera.aspect,
2076
+ });
2077
+ scene.add(cameraSplat);
2078
+
2079
+ updateStatus(`Camera ${cameraParams.length} recorded. Press Space for more or Generate!`, cameraParams.length);
2080
+
2081
+ console.log(new_camera.getFocalLength());
2082
+ }
2083
+ });
2084
+
2085
+ // =========================
2086
+ // Scene Initialization
2087
+ // =========================
2088
+
2089
+ // Initialize status
2090
+ updateStatus('FlashWorld initialized. Configure settings to begin.', 0);
2091
+
2092
+ // Add cube splat to the scene
2093
+ let instructionSplat = createCubeSplat(0.25, [1, 1, 1]);
2094
+ instructionSplat.position.set(0, 0, -1);
2095
+ scene.add(instructionSplat);
2096
+ console.log('Cube splat added to scene');
2097
+
2098
+ // Handle window resize
2099
+ window.addEventListener('resize', () => {
2100
+ console.log('Window resized, updating canvas...');
2101
+ // Update canvas size based on current resolution
2102
+ updateCanvasSize();
2103
+ });
2104
+
2105
+ // =========================
2106
+ // Animation Loop
2107
+ // =========================
2108
+ let lastTime = null;
2109
+
2110
+ renderer.setAnimationLoop(function animate(time) {
2111
+ const deltaTime = time - (lastTime || time);
2112
+ lastTime = time;
2113
+
2114
+ // Rotate the cube splat
2115
+ if (instructionSplat) {
2116
+ // instructionSplat.rotation.x += deltaTime / 4000; // rotate around the X axis
+ instructionSplat.rotation.y += deltaTime / 5000; // rotate around the Y axis
+ instructionSplat.rotation.z += deltaTime / 6000; // rotate around the Z axis
2119
+ }
2120
+
2121
+ // No active playback loop; scrubber directly sets camera
2122
+
2123
+ controls.update(camera);
2124
+ renderer.render(scene, camera);
2125
+
2126
+ });
2127
+
2128
+ </script>
2129
+ </body>
2130
+ </html>
models/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .autoencoder_kl_wan import AutoencoderKLWan
2
+ from .transformer_wan import WanTransformer3DModel
3
+ from .reconstruction_model import WANDecoderPixelAligned3DGSReconstructionModel
4
+
5
+ __all__ = ["AutoencoderKLWan", "WanTransformer3DModel", "WANDecoderPixelAligned3DGSReconstructionModel"]
models/autoencoder_kl_wan.py ADDED
@@ -0,0 +1,1467 @@
1
+ # Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List, Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ import torch.utils.checkpoint
21
+
22
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
23
+ from diffusers.loaders import FromOriginalModelMixin
24
+ from diffusers.utils import logging
25
+ from diffusers.utils.accelerate_utils import apply_forward_hook
26
+ from diffusers.models.activations import get_activation
27
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
28
+ from diffusers.models.modeling_utils import ModelMixin
29
+ from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution
30
+
31
+ import einops
32
+
33
+
34
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
35
+
36
+ CACHE_T = 2
37
+
38
+ class AvgDown3D(nn.Module):
39
+
40
+ def __init__(
41
+ self,
42
+ in_channels,
43
+ out_channels,
44
+ factor_t,
45
+ factor_s=1,
46
+ ):
47
+ super().__init__()
48
+ self.in_channels = in_channels
49
+ self.out_channels = out_channels
50
+ self.factor_t = factor_t
51
+ self.factor_s = factor_s
52
+ self.factor = self.factor_t * self.factor_s * self.factor_s
53
+
54
+ assert in_channels * self.factor % out_channels == 0
55
+ self.group_size = in_channels * self.factor // out_channels
56
+
57
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
58
+ if not ((x.shape[2] == 1 and self.group_size >= self.factor) or self.factor_t == 1):
59
+ pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t
60
+ pad = (0, 0, 0, 0, pad_t, 0)
61
+ x = F.pad(x, pad)
62
+ B, C, T, H, W = x.shape
63
+ x = x.view(
64
+ B,
65
+ C,
66
+ T // self.factor_t,
67
+ self.factor_t,
68
+ H // self.factor_s,
69
+ self.factor_s,
70
+ W // self.factor_s,
71
+ self.factor_s,
72
+ )
73
+ x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
74
+ x = x.view(
75
+ B,
76
+ C * self.factor,
77
+ T // self.factor_t,
78
+ H // self.factor_s,
79
+ W // self.factor_s,
80
+ )
81
+ x = x.view(
82
+ B,
83
+ self.out_channels,
84
+ self.group_size,
85
+ T // self.factor_t,
86
+ H // self.factor_s,
87
+ W // self.factor_s,
88
+ )
89
+ x = x.mean(dim=2)
90
+ return x
91
+ else:
92
+ # print(1)
93
+ pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t
94
+ pad = (0, 0, 0, 0, pad_t, 0)
95
+ B, C, T, H, W = x.shape
96
+ x = x.view(
97
+ B,
98
+ C,
99
+ T,
100
+ 1,
101
+ H // self.factor_s,
102
+ self.factor_s,
103
+ W // self.factor_s,
104
+ self.factor_s,
105
+ )
106
+ x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
107
+ x = x.view(
108
+ B,
109
+ C * self.factor // self.factor_t,
110
+ T,
111
+ H // self.factor_s,
112
+ W // self.factor_s,
113
+ )
114
+ x = x.view(
115
+ B,
116
+ self.out_channels,
117
+ self.group_size // self.factor_t,
118
+ T,
119
+ H // self.factor_s,
120
+ W // self.factor_s,
121
+ )
122
+ # Since the pad values are zeros, the mean only comes out right after the division below
123
+ x = x.mean(dim=2) / (pad_t + 1)
124
+ return x
125
+
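`AvgDown3D` folds `factor_t x factor_s x factor_s` blocks into the channel dimension and then takes a grouped mean, so it downsamples without any learned parameters. A shape sanity check, assuming the class is importable from the module above:

```python
import torch
from models.autoencoder_kl_wan import AvgDown3D

# Fold 2x2x2 blocks into channels, then average groups back down to 96 channels.
down = AvgDown3D(in_channels=96, out_channels=96, factor_t=2, factor_s=2)
x = torch.randn(1, 96, 4, 32, 32)  # (B, C, T, H, W)
print(down(x).shape)               # torch.Size([1, 96, 2, 16, 16])
```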
126
+ class DupUp3D(nn.Module):
127
+
128
+ def __init__(
129
+ self,
130
+ in_channels: int,
131
+ out_channels: int,
132
+ factor_t,
133
+ factor_s=1,
134
+ ):
135
+ super().__init__()
136
+ self.in_channels = in_channels
137
+ self.out_channels = out_channels
138
+
139
+ self.factor_t = factor_t
140
+ self.factor_s = factor_s
141
+ self.factor = self.factor_t * self.factor_s * self.factor_s
142
+
143
+ assert out_channels * self.factor % in_channels == 0
144
+ self.repeats = out_channels * self.factor // in_channels
145
+
146
+ def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
147
+ if not (first_chunk and x.shape[2] == 1):
148
+ x = x.repeat_interleave(self.repeats, dim=1)
149
+ x = x.view(
150
+ x.size(0),
151
+ self.out_channels,
152
+ self.factor_t,
153
+ self.factor_s,
154
+ self.factor_s,
155
+ x.size(2),
156
+ x.size(3),
157
+ x.size(4),
158
+ )
159
+ x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
160
+ x = x.view(
161
+ x.size(0),
162
+ self.out_channels,
163
+ x.size(2) * self.factor_t,
164
+ x.size(4) * self.factor_s,
165
+ x.size(6) * self.factor_s,
166
+ )
167
+ if first_chunk:
168
+ x = x[:, :, self.factor_t - 1:, :, :]
169
+ return x
170
+ else:
171
+ # print(1)
172
+ x = x.repeat_interleave(self.repeats // self.factor_t, dim=1)
173
+ x = x.view(
174
+ x.size(0),
175
+ self.out_channels,
176
+ 1,
177
+ self.factor_s,
178
+ self.factor_s,
179
+ x.size(2),
180
+ x.size(3),
181
+ x.size(4),
182
+ )
183
+ x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
184
+ x = x.view(
185
+ x.size(0),
186
+ self.out_channels,
187
+ x.size(2),
188
+ x.size(4) * self.factor_s,
189
+ x.size(6) * self.factor_s,
190
+ )
191
+ return x
192
+
193
+ class WanCausalConv3d(nn.Conv3d):
194
+ r"""
195
+ A custom 3D causal convolution layer with feature caching support.
196
+
197
+ This layer extends the standard Conv3D layer by ensuring causality in the time dimension and handling feature
198
+ caching for efficient inference.
199
+
200
+ Args:
201
+ in_channels (int): Number of channels in the input image
202
+ out_channels (int): Number of channels produced by the convolution
203
+ kernel_size (int or tuple): Size of the convolving kernel
204
+ stride (int or tuple, optional): Stride of the convolution. Default: 1
205
+ padding (int or tuple, optional): Zero-padding added to all three sides of the input. Default: 0
206
+ """
207
+
208
+ def __init__(
209
+ self,
210
+ in_channels: int,
211
+ out_channels: int,
212
+ kernel_size: Union[int, Tuple[int, int, int]],
213
+ stride: Union[int, Tuple[int, int, int]] = 1,
214
+ padding: Union[int, Tuple[int, int, int]] = 0,
215
+ ) -> None:
216
+ super().__init__(
217
+ in_channels=in_channels,
218
+ out_channels=out_channels,
219
+ kernel_size=kernel_size,
220
+ stride=stride,
221
+ padding=padding,
222
+ )
223
+
224
+ # Set up causal padding
225
+ self._padding = (self.padding[2], self.padding[2], self.padding[1], self.padding[1], 2 * self.padding[0], 0)
226
+ self.padding = (0, 0, 0)
227
+
228
+ def forward(self, x, cache_x=None):
229
+ padding = list(self._padding)
230
+ if cache_x is not None and self._padding[4] > 0:
231
+ cache_x = cache_x.to(x.device)
232
+ x = torch.cat([cache_x, x], dim=2)
233
+ padding[4] -= cache_x.shape[2]
234
+
235
+ if any(padding):
236
+ x = F.pad(x, padding)
237
+
238
+ # print(x.shape)
239
+ return super().forward(x)
240
+
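`WanCausalConv3d` places all temporal zero-padding before the first frame, so output frame t never depends on inputs after t; `cache_x` lets chunked inference replace that padding with the real trailing frames of the previous chunk. A small equivalence sketch, assuming the class is importable from this module:

```python
import torch
from models.autoencoder_kl_wan import WanCausalConv3d

conv = WanCausalConv3d(8, 8, kernel_size=3, padding=1).eval()
x = torch.randn(1, 8, 5, 16, 16)  # 5 frames

y_full = conv(x)  # all temporal padding goes before frame 0 (causal)

# Chunked pass: hand the last CACHE_T = 2 frames of chunk 1 to chunk 2 as cache.
y1 = conv(x[:, :, :3])
y2 = conv(x[:, :, 3:], cache_x=x[:, :, 1:3])
print(torch.allclose(torch.cat([y1, y2], dim=2), y_full, atol=1e-6))  # True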
241
+
242
+ class WanRMS_norm(nn.Module):
243
+ r"""
244
+ A custom RMS normalization layer.
245
+
246
+ Args:
247
+ dim (int): The number of dimensions to normalize over.
248
+ channel_first (bool, optional): Whether the input tensor has channels as the first dimension.
249
+ Default is True.
250
+ images (bool, optional): Whether the input represents image data. Default is True.
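+ weight (bool, optional): Whether to include a learnable scale term. Default is True.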
251
+ bias (bool, optional): Whether to include a learnable bias term. Default is False.
252
+ """
253
+
254
+ def __init__(self, dim: int, channel_first: bool = True, images: bool = True, weight: bool = True, bias: bool = False) -> None:
255
+ super().__init__()
256
+ broadcastable_dims = (1, 1, 1) if not images else (1, 1)
257
+ shape = (dim, *broadcastable_dims) if channel_first else (dim,)
258
+
259
+ self.channel_first = channel_first
260
+ self.scale = dim**0.5
261
+ self.gamma = nn.Parameter(torch.ones(shape)) if weight else 1.0
262
+ self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
263
+
264
+ def forward(self, x):
265
+ return F.normalize(x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma + self.bias
266
+
267
+
268
+ class WanUpsample(nn.Upsample):
269
+ r"""
270
+ Perform upsampling while ensuring the output tensor has the same data type as the input.
271
+
272
+ Args:
273
+ x (torch.Tensor): Input tensor to be upsampled.
274
+
275
+ Returns:
276
+ torch.Tensor: Upsampled tensor with the same data type as the input.
277
+ """
278
+
279
+ def forward(self, x):
280
+ return super().forward(x.float()).type_as(x)
281
+
282
+
283
+ class WanResample(nn.Module):
284
+ r"""
285
+ A custom resampling module for 2D and 3D data.
286
+
287
+ Args:
288
+ dim (int): The number of input/output channels.
289
+ mode (str): The resampling mode. Must be one of:
290
+ - 'none': No resampling (identity operation).
291
+ - 'upsample2d': 2D upsampling with nearest-exact interpolation and convolution.
292
+ - 'upsample3d': 3D upsampling with nearest-exact interpolation, convolution, and causal 3D convolution.
293
+ - 'downsample2d': 2D downsampling with zero-padding and convolution.
294
+ - 'downsample3d': 3D downsampling with zero-padding, convolution, and causal 3D convolution.
295
+ """
296
+
297
+ def __init__(self, dim: int, mode: str, upsample_out_dim: int = None) -> None:
298
+ super().__init__()
299
+ self.dim = dim
300
+ self.mode = mode
301
+
302
+ # default to dim // 2
303
+ if upsample_out_dim is None:
304
+ upsample_out_dim = dim // 2
305
+
306
+ # layers
307
+ if mode == "upsample2d":
308
+ self.resample = nn.Sequential(
309
+ WanUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), nn.Conv2d(dim, upsample_out_dim, 3, padding=1)
310
+ )
311
+ elif mode == "upsample3d":
312
+ self.resample = nn.Sequential(
313
+ WanUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), nn.Conv2d(dim, upsample_out_dim, 3, padding=1)
314
+ )
315
+ self.time_conv = WanCausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
316
+
317
+ elif mode == "downsample2d":
318
+ self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
319
+ elif mode == "downsample3d":
320
+ self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
321
+ self.time_conv = WanCausalConv3d(dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
322
+
323
+ else:
324
+ self.resample = nn.Identity()
325
+
326
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
327
+ b, c, t, h, w = x.size()
328
+ if self.mode == "upsample3d":
329
+ if feat_cache is not None:
330
+ idx = feat_idx[0]
331
+ if feat_cache[idx] is None:
332
+ feat_cache[idx] = "Rep"
333
+ feat_idx[0] += 1
334
+ else:
335
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
336
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep":
337
+ # cache last frame of last two chunk
338
+ cache_x = torch.cat(
339
+ [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2
340
+ )
341
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep":
342
+ cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device), cache_x], dim=2)
343
+ if feat_cache[idx] == "Rep":
344
+ x = self.time_conv(x)
345
+ else:
346
+ x = self.time_conv(x, feat_cache[idx])
347
+ feat_cache[idx] = cache_x
348
+ feat_idx[0] += 1
349
+
350
+ x = x.reshape(b, 2, c, t, h, w)
351
+ x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
352
+ x = x.reshape(b, c, t * 2, h, w)
353
+ t = x.shape[2]
354
+ x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
355
+ x = self.resample(x)
356
+ x = x.view(b, t, x.size(1), x.size(2), x.size(3)).permute(0, 2, 1, 3, 4)
357
+
358
+ if self.mode == "downsample3d":
359
+ if feat_cache is not None:
360
+ idx = feat_idx[0]
361
+ if feat_cache[idx] is None:
362
+ feat_cache[idx] = x.clone()
363
+ feat_idx[0] += 1
364
+ else:
365
+ cache_x = x[:, :, -1:, :, :].clone()
366
+ x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
367
+ feat_cache[idx] = cache_x
368
+ feat_idx[0] += 1
369
+ return x
370
+
371
+
372
+ class WanResidualBlock(nn.Module):
373
+ r"""
374
+ A custom residual block module.
375
+
376
+ Args:
377
+ in_dim (int): Number of input channels.
378
+ out_dim (int): Number of output channels.
379
+ dropout (float, optional): Dropout rate for the dropout layer. Default is 0.0.
380
+ non_linearity (str, optional): Type of non-linearity to use. Default is "silu".
381
+ """
382
+
383
+ def __init__(
384
+ self,
385
+ in_dim: int,
386
+ out_dim: int,
387
+ dropout: float = 0.0,
388
+ non_linearity: str = "silu",
389
+ ) -> None:
390
+ super().__init__()
391
+ self.in_dim = in_dim
392
+ self.out_dim = out_dim
393
+ self.nonlinearity = get_activation(non_linearity)
394
+
395
+ # layers
396
+ self.norm1 = WanRMS_norm(in_dim, images=False)
397
+ self.conv1 = WanCausalConv3d(in_dim, out_dim, 3, padding=1)
398
+ self.norm2 = WanRMS_norm(out_dim, images=False)
399
+ self.dropout = nn.Dropout(dropout)
400
+ self.conv2 = WanCausalConv3d(out_dim, out_dim, 3, padding=1)
401
+ self.conv_shortcut = WanCausalConv3d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()
402
+
403
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
404
+ # Apply shortcut connection
405
+ h = self.conv_shortcut(x)
406
+
407
+ # First normalization and activation
408
+ x = self.norm1(x)
409
+ x = self.nonlinearity(x)
410
+
411
+ if feat_cache is not None:
412
+ idx = feat_idx[0]
413
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
414
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
415
+ cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
416
+
417
+ x = self.conv1(x, feat_cache[idx])
418
+ feat_cache[idx] = cache_x
419
+ feat_idx[0] += 1
420
+ else:
421
+ x = self.conv1(x)
422
+
423
+ # Second normalization and activation
424
+ x = self.norm2(x)
425
+ x = self.nonlinearity(x)
426
+
427
+ # Dropout
428
+ x = self.dropout(x)
429
+
430
+ if feat_cache is not None:
431
+ idx = feat_idx[0]
432
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
433
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
434
+ cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
435
+
436
+ x = self.conv2(x, feat_cache[idx])
437
+ feat_cache[idx] = cache_x
438
+ feat_idx[0] += 1
439
+ else:
440
+ x = self.conv2(x)
441
+
442
+ # Add residual connection
443
+ return h.add_(x)
444
+
445
+
446
+ class WanAttentionBlock(nn.Module):
447
+ r"""
448
+ Causal self-attention with a single head.
449
+
450
+ Args:
451
+ dim (int): The number of channels in the input tensor.
452
+ """
453
+
454
+ def __init__(self, dim):
455
+ super().__init__()
456
+ self.dim = dim
457
+
458
+ # layers
459
+ self.norm = WanRMS_norm(dim)
460
+ self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
461
+ self.proj = nn.Conv2d(dim, dim, 1)
462
+
463
+ def forward(self, x):
464
+ identity = x
465
+ batch_size, channels, time, height, width = x.size()
466
+
467
+ x = x.permute(0, 2, 1, 3, 4).reshape(batch_size * time, channels, height, width)
468
+ x = self.norm(x)
469
+
470
+ # compute query, key, value
471
+ qkv = self.to_qkv(x)
472
+ qkv = qkv.reshape(batch_size * time, 1, channels * 3, -1)
473
+ qkv = qkv.permute(0, 1, 3, 2).contiguous()
474
+ q, k, v = qkv.chunk(3, dim=-1)
475
+
476
+ # apply attention
477
+ x = F.scaled_dot_product_attention(q, k, v)
478
+
479
+ x = x.squeeze(1).permute(0, 2, 1).reshape(batch_size * time, channels, height, width)
480
+
481
+ # output projection
482
+ x = self.proj(x)
483
+
484
+ # Reshape back: [(b*t), c, h, w] -> [b, c, t, h, w]
485
+ x = x.view(batch_size, time, channels, height, width)
486
+ x = x.permute(0, 2, 1, 3, 4)
487
+
488
+ return identity.add_(x)
489
+
490
+
491
+ class WanMidBlock(nn.Module):
492
+ """
493
+ Middle block for WanVAE encoder and decoder.
494
+
495
+ Args:
496
+ dim (int): Number of input/output channels.
497
+ dropout (float): Dropout rate.
498
+ non_linearity (str): Type of non-linearity to use.
499
+ """
500
+
501
+ def __init__(self, dim: int, dropout: float = 0.0, non_linearity: str = "silu", num_layers: int = 1):
502
+ super().__init__()
503
+ self.dim = dim
504
+
505
+ # Create the components
506
+ resnets = [WanResidualBlock(dim, dim, dropout, non_linearity)]
507
+ attentions = []
508
+ for _ in range(num_layers):
509
+ attentions.append(WanAttentionBlock(dim))
510
+ resnets.append(WanResidualBlock(dim, dim, dropout, non_linearity))
511
+ self.attentions = nn.ModuleList(attentions)
512
+ self.resnets = nn.ModuleList(resnets)
513
+
514
+ self.gradient_checkpointing = False
515
+
516
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
517
+ # First residual block
518
+ x = self.resnets[0](x, feat_cache, feat_idx)
519
+
520
+ # Process through attention and residual blocks
521
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
522
+ if attn is not None:
523
+ x = attn(x)
524
+
525
+ x = resnet(x, feat_cache, feat_idx)
526
+
527
+ return x
528
+
529
+
530
+ class WanResidualDownBlock(nn.Module):
531
+
532
+ def __init__(self,
533
+ in_dim,
534
+ out_dim,
535
+ dropout,
536
+ num_res_blocks,
537
+ temperal_downsample=False,
538
+ down_flag=False):
539
+ super().__init__()
540
+
541
+ # Shortcut path with downsample
542
+ self.avg_shortcut = AvgDown3D(
543
+ in_dim,
544
+ out_dim,
545
+ factor_t=2 if temperal_downsample else 1,
546
+ factor_s=2 if down_flag else 1,
547
+ )
548
+
549
+ # Main path with residual blocks and downsample
550
+ resnets = []
551
+ for _ in range(num_res_blocks):
552
+ resnets.append(WanResidualBlock(in_dim, out_dim, dropout))
553
+ in_dim = out_dim
554
+ self.resnets = nn.ModuleList(resnets)
555
+
556
+ # Add the final downsample block
557
+ if down_flag:
558
+ mode = "downsample3d" if temperal_downsample else "downsample2d"
559
+ self.downsampler = WanResample(out_dim, mode=mode)
560
+ else:
561
+ self.downsampler = None
562
+
563
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
564
+ x_copy = x.clone()
565
+ for resnet in self.resnets:
566
+ x = resnet(x, feat_cache, feat_idx)
567
+ if self.downsampler is not None:
568
+ x = self.downsampler(x, feat_cache, feat_idx)
569
+
570
+ return self.avg_shortcut(x_copy).add_(x)
571
+
572
+ class WanEncoder3d(nn.Module):
573
+ r"""
574
+ A 3D encoder module.
575
+
576
+ Args:
577
+ dim (int): The base number of channels in the first layer.
578
+ z_dim (int): The dimensionality of the latent space.
579
+ dim_mult (list of int): Multipliers for the number of channels in each block.
580
+ num_res_blocks (int): Number of residual blocks in each block.
581
+ attn_scales (list of float): Scales at which to apply attention mechanisms.
582
+ temperal_downsample (list of bool): Whether to downsample temporally in each block.
583
+ dropout (float): Dropout rate for the dropout layers.
584
+ non_linearity (str): Type of non-linearity to use.
585
+ """
586
+
587
+ def __init__(
588
+ self,
589
+ in_channels: int = 3,
590
+ dim=128,
591
+ z_dim=4,
592
+ dim_mult=[1, 2, 4, 4],
593
+ num_res_blocks=2,
594
+ attn_scales=[],
595
+ temperal_downsample=[True, True, False],
596
+ dropout=0.0,
597
+ non_linearity: str = "silu",
598
+ is_residual: bool = False, # wan 2.2 vae use a residual downblock
599
+ ):
600
+ super().__init__()
601
+ self.dim = dim
602
+ self.z_dim = z_dim
603
+ self.dim_mult = dim_mult
604
+ self.num_res_blocks = num_res_blocks
605
+ self.attn_scales = attn_scales
606
+ self.temperal_downsample = temperal_downsample
607
+ self.nonlinearity = get_activation(non_linearity)
608
+
609
+ # dimensions
610
+ dims = [dim * u for u in [1] + dim_mult]
611
+ scale = 1.0
612
+
613
+ # init block
614
+ self.conv_in = WanCausalConv3d(in_channels, dims[0], 3, padding=1)
615
+
616
+ # downsample blocks
617
+ self.down_blocks = nn.ModuleList([])
618
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
619
+ # residual (+attention) blocks
620
+ if is_residual:
621
+ self.down_blocks.append(
622
+ WanResidualDownBlock(
623
+ in_dim,
624
+ out_dim,
625
+ dropout,
626
+ num_res_blocks,
627
+ temperal_downsample=temperal_downsample[i] if i != len(dim_mult) - 1 else False,
628
+ down_flag=i != len(dim_mult) - 1,
629
+ )
630
+ )
631
+ else:
632
+ for _ in range(num_res_blocks):
633
+ self.down_blocks.append(WanResidualBlock(in_dim, out_dim, dropout))
634
+ if scale in attn_scales:
635
+ self.down_blocks.append(WanAttentionBlock(out_dim))
636
+ in_dim = out_dim
637
+
638
+ # downsample block
639
+ if i != len(dim_mult) - 1:
640
+ mode = "downsample3d" if temperal_downsample[i] else "downsample2d"
641
+ self.down_blocks.append(WanResample(out_dim, mode=mode))
642
+ scale /= 2.0
643
+
644
+ # middle blocks
645
+ self.mid_block = WanMidBlock(out_dim, dropout, non_linearity, num_layers=1)
646
+
647
+ # output blocks
648
+ self.norm_out = WanRMS_norm(out_dim, images=False)
649
+ self.conv_out = WanCausalConv3d(out_dim, z_dim, 3, padding=1)
650
+
651
+ self.gradient_checkpointing = False
652
+
653
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
654
+ assert x.shape[2] == 1
655
+ if feat_cache is not None:
656
+ idx = feat_idx[0]
657
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
658
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
659
+ # cache last frame of last two chunk
660
+ cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
661
+ x = self.conv_in(x, feat_cache[idx])
662
+ feat_cache[idx] = cache_x
663
+ feat_idx[0] += 1
664
+ else:
665
+ x = self.conv_in(x)
666
+
667
+ ## downsamples
668
+ for layer in self.down_blocks:
669
+ if feat_cache is not None:
670
+ x = layer(x, feat_cache, feat_idx)
671
+ else:
672
+ x = layer(x)
673
+
674
+ ## middle
675
+ x = self.mid_block(x, feat_cache, feat_idx)
676
+
677
+ ## head
678
+ x = self.norm_out(x)
679
+ x = self.nonlinearity(x)
680
+ if feat_cache is not None:
681
+ idx = feat_idx[0]
682
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
683
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
684
+ # cache last frame of last two chunk
685
+ cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
686
+ x = self.conv_out(x, feat_cache[idx])
687
+ feat_cache[idx] = cache_x
688
+ feat_idx[0] += 1
689
+ else:
690
+ x = self.conv_out(x)
691
+ return x
692
+
693
+ class WanResidualUpBlock(nn.Module):
694
+ """
695
+ A block that handles upsampling for the WanVAE decoder.
696
+
697
+ Args:
698
+ in_dim (int): Input dimension
699
+ out_dim (int): Output dimension
700
+ num_res_blocks (int): Number of residual blocks
701
+ dropout (float): Dropout rate
702
+ temperal_upsample (bool): Whether to upsample on temporal dimension
703
+ up_flag (bool): Whether to upsample or not
704
+ non_linearity (str): Type of non-linearity to use
705
+ """
706
+
707
+ def __init__(
708
+ self,
709
+ in_dim: int,
710
+ out_dim: int,
711
+ num_res_blocks: int,
712
+ dropout: float = 0.0,
713
+ temperal_upsample: bool = False,
714
+ up_flag: bool = False,
715
+ non_linearity: str = "silu",
716
+ ):
717
+ super().__init__()
718
+ self.in_dim = in_dim
719
+ self.out_dim = out_dim
720
+
721
+ if up_flag:
722
+ self.avg_shortcut = DupUp3D(
723
+ in_dim,
724
+ out_dim,
725
+ factor_t=2 if temperal_upsample else 1,
726
+ factor_s=2,
727
+ )
728
+ else:
729
+ self.avg_shortcut = None
730
+
731
+ # create residual blocks
732
+ resnets = []
733
+ current_dim = in_dim
734
+ for _ in range(num_res_blocks + 1):
735
+ resnets.append(WanResidualBlock(current_dim, out_dim, dropout, non_linearity))
736
+ current_dim = out_dim
737
+
738
+ self.resnets = nn.ModuleList(resnets)
739
+
740
+ # Add upsampling layer if needed
741
+ if up_flag:
742
+ upsample_mode = "upsample3d" if temperal_upsample else "upsample2d"
743
+ self.upsampler = WanResample(out_dim, mode=upsample_mode, upsample_out_dim=out_dim)
744
+ else:
745
+ self.upsampler = None
746
+
747
+ self.gradient_checkpointing = False
748
+
749
+ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
750
+ """
751
+ Forward pass through the upsampling block.
752
+
753
+ Args:
754
+ x (torch.Tensor): Input tensor
755
+ feat_cache (list, optional): Feature cache for causal convolutions
756
+ feat_idx (list, optional): Feature index for cache management
757
+
758
+ Returns:
759
+ torch.Tensor: Output tensor
760
+ """
761
+ x_copy = x.clone()
762
+
763
+ for resnet in self.resnets:
764
+ if feat_cache is not None:
765
+ x = resnet(x, feat_cache, feat_idx)
766
+ else:
767
+ x = resnet(x)
768
+
769
+ if self.upsampler is not None:
770
+ if feat_cache is not None:
771
+ x = self.upsampler(x, feat_cache, feat_idx)
772
+ else:
773
+ x = self.upsampler(x)
774
+
775
+ if self.avg_shortcut is not None:
776
+ # print(x.shape, x_copy.shape, self.avg_shortcut(x_copy, first_chunk=first_chunk).shape)
777
+ x = x + self.avg_shortcut(x_copy, first_chunk=first_chunk)
778
+
779
+ return x
780
+
781
+ class WanUpBlock(nn.Module):
782
+ """
783
+ A block that handles upsampling for the WanVAE decoder.
784
+
785
+ Args:
786
+ in_dim (int): Input dimension
787
+ out_dim (int): Output dimension
788
+ num_res_blocks (int): Number of residual blocks
789
+ dropout (float): Dropout rate
790
+ upsample_mode (str, optional): Mode for upsampling ('upsample2d' or 'upsample3d')
791
+ non_linearity (str): Type of non-linearity to use
792
+ """
793
+
794
+ def __init__(
795
+ self,
796
+ in_dim: int,
797
+ out_dim: int,
798
+ num_res_blocks: int,
799
+ dropout: float = 0.0,
800
+ upsample_mode: Optional[str] = None,
801
+ non_linearity: str = "silu",
802
+ ):
803
+ super().__init__()
804
+ self.in_dim = in_dim
805
+ self.out_dim = out_dim
806
+
807
+ # Create layers list
808
+ resnets = []
809
+ # Add residual blocks and attention if needed
810
+ current_dim = in_dim
811
+ for _ in range(num_res_blocks + 1):
812
+ resnets.append(WanResidualBlock(current_dim, out_dim, dropout, non_linearity))
813
+ current_dim = out_dim
814
+
815
+ self.resnets = nn.ModuleList(resnets)
816
+
817
+ # Add upsampling layer if needed
818
+ self.upsamplers = None
819
+ if upsample_mode is not None:
820
+ self.upsamplers = nn.ModuleList([WanResample(out_dim, mode=upsample_mode)])
821
+
822
+ self.gradient_checkpointing = False
823
+
824
+ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=None):
825
+ """
826
+ Forward pass through the upsampling block.
827
+
828
+ Args:
829
+ x (torch.Tensor): Input tensor
830
+ feat_cache (list, optional): Feature cache for causal convolutions
831
+ feat_idx (list, optional): Feature index for cache management
832
+
833
+ Returns:
834
+ torch.Tensor: Output tensor
835
+ """
836
+ for resnet in self.resnets:
837
+ if feat_cache is not None:
838
+ x = resnet(x, feat_cache, feat_idx)
839
+ else:
840
+ x = resnet(x)
841
+
842
+ if self.upsamplers is not None:
843
+ if feat_cache is not None:
844
+ x = self.upsamplers[0](x, feat_cache, feat_idx)
845
+ else:
846
+ x = self.upsamplers[0](x)
847
+ return x
848
+
849
+
850
+ class WanDecoder3d(nn.Module):
851
+ r"""
852
+ A 3D decoder module.
853
+
854
+ Args:
855
+ dim (int): The base number of channels in the first layer.
856
+ z_dim (int): The dimensionality of the latent space.
857
+ dim_mult (list of int): Multipliers for the number of channels in each block.
858
+ num_res_blocks (int): Number of residual blocks in each block.
859
+ attn_scales (list of float): Scales at which to apply attention mechanisms.
860
+ temperal_upsample (list of bool): Whether to upsample temporally in each block.
861
+ dropout (float): Dropout rate for the dropout layers.
862
+ non_linearity (str): Type of non-linearity to use.
863
+ """
864
+
865
+ def __init__(
866
+ self,
867
+ dim=128,
868
+ z_dim=4,
869
+ dim_mult=[1, 2, 4, 4],
870
+ num_res_blocks=2,
871
+ attn_scales=[],
872
+ temperal_upsample=[False, True, True],
873
+ dropout=0.0,
874
+ non_linearity: str = "silu",
875
+ out_channels: int = 3,
876
+ is_residual: bool = False,
877
+ ):
878
+ super().__init__()
879
+ self.dim = dim
880
+ self.z_dim = z_dim
881
+ self.dim_mult = dim_mult
882
+ self.num_res_blocks = num_res_blocks
883
+ self.attn_scales = attn_scales
884
+ self.temperal_upsample = temperal_upsample
885
+
886
+ self.nonlinearity = get_activation(non_linearity)
887
+
888
+ # dimensions
889
+ dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
890
+
891
+ # init block
892
+ self.conv_in = WanCausalConv3d(z_dim, dims[0], 3, padding=1)
893
+
894
+ # middle blocks
895
+ self.mid_block = WanMidBlock(dims[0], dropout, non_linearity, num_layers=1)
896
+
897
+ # upsample blocks
898
+ self.up_blocks = nn.ModuleList([])
899
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
900
+ # residual (+attention) blocks
901
+ if i > 0 and not is_residual:
902
+ # wan vae 2.1
903
+ in_dim = in_dim // 2
904
+
905
+ # determine if we need upsampling
906
+ up_flag = i != len(dim_mult) - 1
907
+ # determine upsampling mode, if not upsampling, set to None
908
+ upsample_mode = None
909
+ if up_flag and temperal_upsample[i]:
910
+ upsample_mode = "upsample3d"
911
+ elif up_flag:
912
+ upsample_mode = "upsample2d"
913
+ # Create and add the upsampling block
914
+ if is_residual:
915
+ up_block = WanResidualUpBlock(
916
+ in_dim=in_dim,
917
+ out_dim=out_dim,
918
+ num_res_blocks=num_res_blocks,
919
+ dropout=dropout,
920
+ temperal_upsample=temperal_upsample[i] if up_flag else False,
921
+ up_flag=up_flag,
922
+ non_linearity=non_linearity,
923
+ )
924
+ else:
925
+ up_block = WanUpBlock(
926
+ in_dim=in_dim,
927
+ out_dim=out_dim,
928
+ num_res_blocks=num_res_blocks,
929
+ dropout=dropout,
930
+ upsample_mode=upsample_mode,
931
+ non_linearity=non_linearity,
932
+ )
933
+ self.up_blocks.append(up_block)
934
+
935
+ # output blocks
936
+ self.norm_out = WanRMS_norm(out_dim, images=False)
937
+ self.conv_out = WanCausalConv3d(out_dim, out_channels, 3, padding=1)
938
+
939
+ self.gradient_checkpointing = False
940
+
941
+ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
942
+ assert x.shape[2] == 1
943
+ ## conv1
944
+ if feat_cache is not None:
945
+ idx = feat_idx[0]
946
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
947
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
948
+ # cache last frame of last two chunk
949
+ cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
950
+ x = self.conv_in(x, feat_cache[idx])
951
+ feat_cache[idx] = cache_x
952
+ feat_idx[0] += 1
953
+ else:
954
+ x = self.conv_in(x)
955
+
956
+ ## middle
957
+ x = self.mid_block(x, feat_cache, feat_idx)
958
+
959
+ ## upsamples
960
+ for up_block in self.up_blocks:
961
+ x = up_block(x, feat_cache, feat_idx, first_chunk=first_chunk)
962
+
963
+ ## head
964
+ x = self.norm_out(x)
965
+ x = self.nonlinearity(x)
966
+ if feat_cache is not None:
967
+ idx = feat_idx[0]
968
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
969
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
970
+ # cache last frame of last two chunk
971
+ cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
972
+ x = self.conv_out(x, feat_cache[idx])
973
+ feat_cache[idx] = cache_x
974
+ feat_idx[0] += 1
975
+ else:
976
+ x = self.conv_out(x)
977
+ return x
978
+
979
+
980
+ def patchify(x, patch_size):
981
+ # YiYi TODO: refactor this
982
+ from einops import rearrange
983
+ if patch_size == 1:
984
+ return x
985
+ if x.dim() == 4:
986
+ x = rearrange(
987
+ x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size, r=patch_size)
988
+ elif x.dim() == 5:
989
+ x = rearrange(
990
+ x,
991
+ "b c f (h q) (w r) -> b (c r q) f h w",
992
+ q=patch_size,
993
+ r=patch_size,
994
+ )
995
+ else:
996
+ raise ValueError(f"Invalid input shape: {x.shape}")
997
+
998
+ return x
999
+
1000
+
1001
+ def unpatchify(x, patch_size):
1002
+ # YiYi TODO: refactor this
1003
+ from einops import rearrange
1004
+ if patch_size == 1:
1005
+ return x
1006
+
1007
+ if x.dim() == 4:
1008
+ x = rearrange(
1009
+ x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size, r=patch_size)
1010
+ elif x.dim() == 5:
1011
+ x = rearrange(
1012
+ x,
1013
+ "b (c r q) f h w -> b c f (h q) (w r)",
1014
+ q=patch_size,
1015
+ r=patch_size,
1016
+ )
1017
+ return x
1018
+
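`patchify` folds non-overlapping `patch_size x patch_size` spatial patches into channels and `unpatchify` exactly inverts it (this is how the Wan 2.2 VAE trades resolution for channel depth). A roundtrip check, assuming the functions above are importable:

```python
import torch
from models.autoencoder_kl_wan import patchify, unpatchify

x = torch.randn(1, 3, 4, 32, 32)                    # (B, C, F, H, W)
p = patchify(x, patch_size=2)
print(p.shape)                                      # torch.Size([1, 12, 4, 16, 16])
print(torch.equal(unpatchify(p, patch_size=2), x))  # True: exact inverse
```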
1019
+ class AutoencoderKLWan(ModelMixin, ConfigMixin, FromOriginalModelMixin):
1020
+ r"""
1021
+ A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos.
1022
+ Introduced in [Wan 2.1].
1023
+
1024
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
1025
+ for all models (such as downloading or saving).
1026
+ """
1027
+
1028
+ _supports_gradient_checkpointing = False
1029
+
1030
+ @register_to_config
1031
+ def __init__(
1032
+ self,
1033
+ base_dim: int = 96,
1034
+ decoder_base_dim: Optional[int] = None,
1035
+ z_dim: int = 16,
1036
+ dim_mult: Tuple[int] = [1, 2, 4, 4],
1037
+ num_res_blocks: int = 2,
1038
+ attn_scales: List[float] = [],
1039
+ temperal_downsample: List[bool] = [False, True, True],
1040
+ dropout: float = 0.0,
1041
+ latents_mean: List[float] = [
1042
+ -0.7571,
1043
+ -0.7089,
1044
+ -0.9113,
1045
+ 0.1075,
1046
+ -0.1745,
1047
+ 0.9653,
1048
+ -0.1517,
1049
+ 1.5508,
1050
+ 0.4134,
1051
+ -0.0715,
1052
+ 0.5517,
1053
+ -0.3632,
1054
+ -0.1922,
1055
+ -0.9497,
1056
+ 0.2503,
1057
+ -0.2921,
1058
+ ],
1059
+ latents_std: List[float] = [
1060
+ 2.8184,
1061
+ 1.4541,
1062
+ 2.3275,
1063
+ 2.6558,
1064
+ 1.2196,
1065
+ 1.7708,
1066
+ 2.6052,
1067
+ 2.0743,
1068
+ 3.2687,
1069
+ 2.1526,
1070
+ 2.8652,
1071
+ 1.5579,
1072
+ 1.6382,
1073
+ 1.1253,
1074
+ 2.8251,
1075
+ 1.9160,
1076
+ ],
1077
+ is_residual: bool = False,
1078
+ in_channels: int = 3,
1079
+ out_channels: int = 3,
1080
+ patch_size: Optional[int] = None,
1081
+ scale_factor_temporal: Optional[int] = 4,
1082
+ scale_factor_spatial: Optional[int] = 8,
1083
+ clip_output: bool = True,
1084
+ ) -> None:
1085
+ super().__init__()
1086
+
1087
+ self.z_dim = z_dim
1088
+ self.temperal_downsample = temperal_downsample
1089
+ self.temperal_upsample = temperal_downsample[::-1]
1090
+
1091
+ if decoder_base_dim is None:
1092
+ decoder_base_dim = base_dim
1093
+
1094
+ self.encoder = WanEncoder3d(
1095
+ in_channels=in_channels, dim=base_dim, z_dim=z_dim * 2, dim_mult=dim_mult, num_res_blocks=num_res_blocks, attn_scales=attn_scales, temperal_downsample=temperal_downsample, dropout=dropout, is_residual=is_residual
1096
+ )
1097
+ self.quant_conv = WanCausalConv3d(z_dim * 2, z_dim * 2, 1)
1098
+ self.post_quant_conv = WanCausalConv3d(z_dim, z_dim, 1)
1099
+
1100
+ self.decoder = WanDecoder3d(
1101
+ dim=decoder_base_dim, z_dim=z_dim, dim_mult=dim_mult, num_res_blocks=num_res_blocks, attn_scales=attn_scales, temperal_upsample=self.temperal_upsample, dropout=dropout, out_channels=out_channels, is_residual=is_residual
1102
+ )
1103
+
1104
+ self.spatial_compression_ratio = 2 ** len(self.temperal_downsample)
1105
+
1106
+ # When decoding a batch of video latents at a time, one can save memory by slicing across the batch dimension
1107
+ # to perform decoding of a single video latent at a time.
1108
+ self.use_slicing = False
1109
+
1110
+ # When decoding spatially large video latents, the memory requirement is very high. By breaking the video latent
1111
+ # frames spatially into smaller tiles and performing multiple forward passes for decoding, and then blending the
1112
+ # intermediate tiles together, the memory requirement can be lowered.
1113
+ self.use_tiling = False
1114
+
1115
+ # The minimal tile height and width for spatial tiling to be used
1116
+ self.tile_sample_min_height = 256
1117
+ self.tile_sample_min_width = 256
1118
+
1119
+ # The minimal distance between two spatial tiles
1120
+ self.tile_sample_stride_height = 192
1121
+ self.tile_sample_stride_width = 192
1122
+
1123
+ # Precompute and cache conv counts for encoder and decoder for clear_cache speedup
1124
+ self._cached_conv_counts = {
1125
+ "decoder": sum(isinstance(m, WanCausalConv3d) for m in self.decoder.modules())
1126
+ if self.decoder is not None
1127
+ else 0,
1128
+ "encoder": sum(isinstance(m, WanCausalConv3d) for m in self.encoder.modules())
1129
+ if self.encoder is not None
1130
+ else 0,
1131
+ }
1132
+
1133
+ def enable_tiling(
1134
+ self,
1135
+ tile_sample_min_height: Optional[int] = None,
1136
+ tile_sample_min_width: Optional[int] = None,
1137
+ tile_sample_stride_height: Optional[float] = None,
1138
+ tile_sample_stride_width: Optional[float] = None,
1139
+ ) -> None:
1140
+ r"""
1141
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
1142
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
1143
+ processing larger images.
1144
+
1145
+ Args:
1146
+ tile_sample_min_height (`int`, *optional*):
1147
+ The minimum height required for a sample to be separated into tiles across the height dimension.
1148
+ tile_sample_min_width (`int`, *optional*):
1149
+ The minimum width required for a sample to be separated into tiles across the width dimension.
1150
+ tile_sample_stride_height (`int`, *optional*):
1151
+ The stride between two consecutive vertical tiles. This is to ensure that there are
1152
+ no tiling artifacts produced across the height dimension.
1153
+ tile_sample_stride_width (`int`, *optional*):
1154
+ The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
1155
+ artifacts produced across the width dimension.
1156
+ """
1157
+ self.use_tiling = True
1158
+ self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
1159
+ self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
1160
+ self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
1161
+ self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
1162
+
1163
+ def disable_tiling(self) -> None:
1164
+ r"""
1165
+ Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
1166
+ decoding in one step.
1167
+ """
1168
+ self.use_tiling = False
1169
+
1170
+ def enable_slicing(self) -> None:
1171
+ r"""
1172
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
1173
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
1174
+ """
1175
+ self.use_slicing = True
1176
+
1177
+ def disable_slicing(self) -> None:
1178
+ r"""
1179
+ Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
1180
+ decoding in one step.
1181
+ """
1182
+ self.use_slicing = False
1183
+
1184
+ def clear_cache(self):
1185
+ # Use cached conv counts for decoder and encoder to avoid re-iterating modules each call
1186
+ self._conv_num = self._cached_conv_counts["decoder"]
1187
+ self._conv_idx = [0]
1188
+ self._feat_map = [None] * self._conv_num
1189
+ # cache encode
1190
+ self._enc_conv_num = self._cached_conv_counts["encoder"]
1191
+ self._enc_conv_idx = [0]
1192
+ self._enc_feat_map = [None] * self._enc_conv_num
1193
+
1194
+ def _encode(self, x: torch.Tensor):
1195
+ _, _, num_frame, height, width = x.shape
1196
+
1197
+ if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
1198
+ return self.tiled_encode(x)
1199
+
1200
+ self.clear_cache()
1201
+ if self.config.patch_size is not None:
1202
+ x = patchify(x, patch_size=self.config.patch_size)
1203
+ iter_ = 1 + (num_frame - 1) // 4
1204
+ self._enc_feat_map = None if iter_ == 1 else self._enc_feat_map
1205
+ for i in range(iter_):
1206
+ self._enc_conv_idx = [0]
1207
+ if i == 0:
1208
+ out = self.encoder(x[:, :, :1, :, :], feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
1209
+ else:
1210
+ out_ = self.encoder(
1211
+ x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i, :, :],
1212
+ feat_cache=self._enc_feat_map,
1213
+ feat_idx=self._enc_conv_idx,
1214
+ )
1215
+ out = torch.cat([out, out_], 2)
1216
+
1217
+ enc = self.quant_conv(out)
1218
+ self.clear_cache()
1219
+ return enc
1220
+
1221
+ @apply_forward_hook
1222
+ def encode(
1223
+ self, x: torch.Tensor, return_dict: bool = True
1224
+ ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
1225
+ r"""
1226
+ Encode a batch of images into latents.
1227
+
1228
+ Args:
1229
+ x (`torch.Tensor`): Input batch of images.
1230
+ return_dict (`bool`, *optional*, defaults to `True`):
1231
+ Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
1232
+
1233
+ Returns:
1234
+ The latent representations of the encoded videos. If `return_dict` is True, a
1235
+ [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
1236
+ """
1237
+ if self.use_slicing and x.shape[0] > 1:
1238
+ encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
1239
+ h = torch.cat(encoded_slices)
1240
+ else:
1241
+ h = self._encode(x)
1242
+ posterior = DiagonalGaussianDistribution(h)
1243
+
1244
+ if not return_dict:
1245
+ return (posterior,)
1246
+ return AutoencoderKLOutput(latent_dist=posterior)
1247
+
1248
+ def _decode(self, z: torch.Tensor, return_dict: bool = True):
1249
+ _, _, num_frame, height, width = z.shape
1250
+ tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
1251
+ tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
1252
+
1253
+ if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height):
1254
+ return self.tiled_decode(z, return_dict=return_dict)
1255
+
1256
+ self.clear_cache()
1257
+ self._feat_map = None if num_frame == 1 else self._feat_map
1258
+ x = self.post_quant_conv(z)
1259
+ for i in range(num_frame):
1260
+ self._conv_idx = [0]
1261
+ if i == 0:
1262
+ out = self.decoder(x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx, first_chunk=True)
1263
+ else:
1264
+ out_ = self.decoder(x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
1265
+ out = torch.cat([out, out_], 2)
1266
+
1267
+ if self.config.clip_output:
1268
+ out = torch.clamp(out, min=-1.0, max=1.0)
1269
+ if self.config.patch_size is not None:
1270
+ out = unpatchify(out, patch_size=self.config.patch_size)
1271
+ self.clear_cache()
1272
+ if not return_dict:
1273
+ return (out,)
1274
+
1275
+ return DecoderOutput(sample=out)
1276
+
1277
+ @apply_forward_hook
1278
+ def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
1279
+ r"""
1280
+ Decode a batch of images.
1281
+
1282
+ Args:
1283
+ z (`torch.Tensor`): Input batch of latent vectors.
1284
+ return_dict (`bool`, *optional*, defaults to `True`):
1285
+ Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
1286
+
1287
+ Returns:
1288
+ [`~models.vae.DecoderOutput`] or `tuple`:
1289
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
1290
+ returned.
1291
+ """
1292
+ if self.use_slicing and z.shape[0] > 1:
1293
+ decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
1294
+ decoded = torch.cat(decoded_slices)
1295
+ else:
1296
+ decoded = self._decode(z).sample
1297
+
1298
+ if not return_dict:
1299
+ return (decoded,)
1300
+ return DecoderOutput(sample=decoded)
1301
+
1302
+ def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
1303
+ blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
1304
+ for y in range(blend_extent):
1305
+ b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
1306
+ y / blend_extent
1307
+ )
1308
+ return b
1309
+
1310
+ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
1311
+ blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
1312
+ for x in range(blend_extent):
1313
+ b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
1314
+ x / blend_extent
1315
+ )
1316
+ return b
1317
+
1318
+ def tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput:
1319
+ r"""Encode a batch of images using a tiled encoder.
1320
+
1321
+ Args:
1322
+ x (`torch.Tensor`): Input batch of videos.
1323
+
1324
+ Returns:
1325
+ `torch.Tensor`:
1326
+ The latent representation of the encoded videos.
1327
+ """
1328
+ _, _, num_frames, height, width = x.shape
1329
+ latent_height = height // self.spatial_compression_ratio
1330
+ latent_width = width // self.spatial_compression_ratio
1331
+
1332
+ tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
1333
+ tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
1334
+ tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
1335
+ tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
1336
+
1337
+ blend_height = tile_latent_min_height - tile_latent_stride_height
1338
+ blend_width = tile_latent_min_width - tile_latent_stride_width
1339
+
1340
+ # Split x into overlapping tiles and encode them separately.
1341
+ # The tiles have an overlap to avoid seams between tiles.
1342
+ rows = []
1343
+ for i in range(0, height, self.tile_sample_stride_height):
1344
+ row = []
1345
+ for j in range(0, width, self.tile_sample_stride_width):
1346
+ self.clear_cache()
1347
+ time = []
1348
+ frame_range = 1 + (num_frames - 1) // 4
1349
+ for k in range(frame_range):
1350
+ self._enc_conv_idx = [0]
1351
+ if k == 0:
1352
+ tile = x[:, :, :1, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
1353
+ else:
1354
+ tile = x[
1355
+ :,
1356
+ :,
1357
+ 1 + 4 * (k - 1) : 1 + 4 * k,
1358
+ i : i + self.tile_sample_min_height,
1359
+ j : j + self.tile_sample_min_width,
1360
+ ]
1361
+ tile = self.encoder(tile, feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
1362
+ tile = self.quant_conv(tile)
1363
+ time.append(tile)
1364
+ row.append(torch.cat(time, dim=2))
1365
+ rows.append(row)
1366
+ self.clear_cache()
1367
+
1368
+ result_rows = []
1369
+ for i, row in enumerate(rows):
1370
+ result_row = []
1371
+ for j, tile in enumerate(row):
1372
+ # blend the above tile and the left tile
1373
+ # to the current tile and add the current tile to the result row
1374
+ if i > 0:
1375
+ tile = self.blend_v(rows[i - 1][j], tile, blend_height)
1376
+ if j > 0:
1377
+ tile = self.blend_h(row[j - 1], tile, blend_width)
1378
+ result_row.append(tile[:, :, :, :tile_latent_stride_height, :tile_latent_stride_width])
1379
+ result_rows.append(torch.cat(result_row, dim=-1))
1380
+
1381
+ enc = torch.cat(result_rows, dim=3)[:, :, :, :latent_height, :latent_width]
1382
+ return enc
1383
+
1384
+ def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
1385
+ r"""
1386
+ Decode a batch of images using a tiled decoder.
1387
+
1388
+ Args:
1389
+ z (`torch.Tensor`): Input batch of latent vectors.
1390
+ return_dict (`bool`, *optional*, defaults to `True`):
1391
+ Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
1392
+
1393
+ Returns:
1394
+ [`~models.vae.DecoderOutput`] or `tuple`:
1395
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
1396
+ returned.
1397
+ """
1398
+ _, _, num_frames, height, width = z.shape
1399
+ sample_height = height * self.spatial_compression_ratio
1400
+ sample_width = width * self.spatial_compression_ratio
1401
+
1402
+ tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
1403
+ tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
1404
+ tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
1405
+ tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
1406
+
1407
+ blend_height = self.tile_sample_min_height - self.tile_sample_stride_height
1408
+ blend_width = self.tile_sample_min_width - self.tile_sample_stride_width
1409
+
1410
+ # Split z into overlapping tiles and decode them separately.
1411
+ # The tiles have an overlap to avoid seams between tiles.
1412
+ rows = []
1413
+ for i in range(0, height, tile_latent_stride_height):
1414
+ row = []
1415
+ for j in range(0, width, tile_latent_stride_width):
1416
+ self.clear_cache()
1417
+ time = []
1418
+ for k in range(num_frames):
1419
+ self._conv_idx = [0]
1420
+ tile = z[:, :, k : k + 1, i : i + tile_latent_min_height, j : j + tile_latent_min_width]
1421
+ tile = self.post_quant_conv(tile)
1422
+ decoded = self.decoder(tile, feat_cache=self._feat_map, feat_idx=self._conv_idx)
1423
+ time.append(decoded)
1424
+ row.append(torch.cat(time, dim=2))
1425
+ rows.append(row)
1426
+ self.clear_cache()
1427
+
1428
+ result_rows = []
1429
+ for i, row in enumerate(rows):
1430
+ result_row = []
1431
+ for j, tile in enumerate(row):
1432
+ # blend the above tile and the left tile
1433
+ # to the current tile and add the current tile to the result row
1434
+ if i > 0:
1435
+ tile = self.blend_v(rows[i - 1][j], tile, blend_height)
1436
+ if j > 0:
1437
+ tile = self.blend_h(row[j - 1], tile, blend_width)
1438
+ result_row.append(tile[:, :, :, : self.tile_sample_stride_height, : self.tile_sample_stride_width])
1439
+ result_rows.append(torch.cat(result_row, dim=-1))
1440
+
1441
+ dec = torch.cat(result_rows, dim=3)[:, :, :, :sample_height, :sample_width]
1442
+
1443
+ if not return_dict:
1444
+ return (dec,)
1445
+ return DecoderOutput(sample=dec)
1446
+
1447
+ def forward(
1448
+ self,
1449
+ sample: torch.Tensor,
1450
+ sample_posterior: bool = False,
1451
+ return_dict: bool = True,
1452
+ generator: Optional[torch.Generator] = None,
1453
+ ) -> Union[DecoderOutput, torch.Tensor]:
1454
+ """
1455
+ Args:
1456
+ sample (`torch.Tensor`): Input sample.
1457
+ return_dict (`bool`, *optional*, defaults to `True`):
1458
+ Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
1459
+ """
1460
+ x = sample
1461
+ posterior = self.encode(x).latent_dist
1462
+ if sample_posterior:
1463
+ z = posterior.sample(generator=generator)
1464
+ else:
1465
+ z = posterior.mode()
1466
+ dec = self.decode(z, return_dict=return_dict)
1467
+ return dec
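
Note that this variant of the autoencoder asserts single-frame chunks in both the encoder and decoder (see the `assert x.shape[2] == 1` lines above), matching how FlashWorld feeds it. A hedged usage sketch with the constructor defaults; a real checkpoint would be loaded with `from_pretrained` instead of random weights:

```python
import torch
from models import AutoencoderKLWan

vae = AutoencoderKLWan().eval()        # defaults: z_dim=16, 8x spatial compression
frame = torch.randn(1, 3, 1, 64, 64)   # (B, C, T=1, H, W), values in [-1, 1]

with torch.no_grad():
    posterior = vae.encode(frame).latent_dist
    z = posterior.mode()               # (1, 16, 1, 8, 8)
    recon = vae.decode(z).sample       # (1, 3, 1, 64, 64), clamped to [-1, 1]
```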
models/reconstruction_model.py ADDED
@@ -0,0 +1,261 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ import math
6
+ import numpy as np
7
+
8
+ from utils import zero_init, EMANorm, create_rays
9
+
10
+ import einops
11
+
12
+ from .render import gaussian_render
13
+
14
+ from utils import quaternion_to_matrix
15
+
16
+ def inverse_sigmoid(x):
17
+ if isinstance(x, torch.Tensor):
18
+ return torch.log(x/(1-x))
19
+ else:
20
+ return math.log(x/(1-x))
21
+
22
+ def inverse_softplus(x, beta=1):
23
+ if isinstance(x, torch.Tensor):
24
+ return (torch.exp(beta * x) - 1).log() / beta
25
+ else:
26
+ return math.log((math.exp(beta * x) - 1)) / beta
27
+
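These inverses let initialization pick a raw value whose activated result is a chosen target (e.g., a starting opacity after sigmoid, or a starting scale after softplus). A quick numeric check:

```python
import torch
import torch.nn.functional as F

p = torch.tensor(0.1)             # target opacity after sigmoid
raw = torch.log(p / (1 - p))      # inverse_sigmoid(p)
print(torch.sigmoid(raw))         # tensor(0.1000)

s = torch.tensor(0.5)             # target scale after softplus (beta=1)
raw_s = (torch.exp(s) - 1).log()  # inverse_softplus(s, beta=1)
print(F.softplus(raw_s))          # tensor(0.5000)
```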
28
+ import copy
29
+
30
+ import math
31
+ import torch
32
+ import torch.nn as nn
33
+ import numpy as np
34
+
35
+ from .autoencoder_kl_wan import WanCausalConv3d, WanRMS_norm, unpatchify
36
+
37
+
38
+ class WANDecoderPixelAligned3DGSReconstructionModel(nn.Module):
39
+ def __init__(self,
40
+ vae_model,
41
+ feat_dim,
42
+ # num_remove_decoder_up_blocks=0,
43
+ # num_points_per_pixel=4,
44
+ use_network_checkpointing=True,
45
+ use_render_checkpointing=True
46
+ ):
47
+ super().__init__()
48
+
49
+ self.decoder = copy.deepcopy(vae_model.decoder).requires_grad_(True)
50
+ self.post_quant_conv = copy.deepcopy(vae_model.post_quant_conv).requires_grad_(True)
51
+
52
+ self.extra_conv_in = WanCausalConv3d(feat_dim, self.decoder.conv_in.weight.shape[0], 3, padding=1)
53
+
54
+ time_pad = self.extra_conv_in._padding[4]
55
+ self.extra_conv_in.padding = (0, self.extra_conv_in._padding[2], self.extra_conv_in._padding[0])
56
+ self.extra_conv_in._padding = (0, 0, 0, 0, 0, 0)
57
+ self.extra_conv_in.weight = torch.nn.Parameter(self.extra_conv_in.weight[:, :, time_pad:].clone())
58
+
59
+ with torch.no_grad():
60
+ self.extra_conv_in.weight.data.zero_()
61
+ self.extra_conv_in.bias.data.zero_()
62
+
63
+ # remove one block
64
+ # self.decoder.up_blocks = self.decoder.up_blocks[:-1]
65
+ dims = [self.decoder.dim * u for u in [self.decoder.dim_mult[-1]] + self.decoder.dim_mult[::-1]]
66
+ # self.decoder.up_blocks[-1].upsampler.mode = None
67
+ # self.decoder.up_blocks[-1].upsampler.resample = nn.Identity()
68
+ # self.decoder.up_blocks[-1].avg_shortcut = None
69
+
70
+ self.decoder.norm_out = WanRMS_norm(dims[-1], images=False, bias=False)
71
+ self.decoder.conv_out = nn.Identity()
72
+
73
+ # add ema_norm for vae
74
+ # for i_level in reversed(range(len(self.decoder.up_blocks))):
75
+ # if self.decoder.up_blocks[i_level].upsampler is not None:
76
+ # self.decoder.up_blocks[i_level].upsampler.resample = nn.Sequential(
77
+ # self.decoder.up_blocks[i_level].upsampler.resample,
78
+ # )
79
+
80
+ self.patch_size = vae_model.config.patch_size
81
+ # assert dims[-1] % 4 == 0
82
+ self.gs_head = PixelAligned3DGS(dims[-1], num_points_per_pixel=2)
83
+
84
+ del self.decoder.up_blocks[0].upsampler.time_conv
85
+ del self.decoder.up_blocks[1].upsampler.time_conv
86
+
87
+ self.decoder.conv_out = nn.Identity()
88
+
89
+ self.network_checkpointing = use_network_checkpointing
90
+ self.render_checkpointing = use_render_checkpointing
91
+
92
+ def decode(self, feats, z):
93
+ ## conv1
94
+ x = self.decoder.conv_in(self.post_quant_conv(z)) + self.extra_conv_in(feats)
95
+
96
+ ## middle
97
+ if self.network_checkpointing and torch.is_grad_enabled():
98
+ x = torch.utils.checkpoint.checkpoint(self.decoder.mid_block, x, None, [0], use_reentrant=False)
99
+ else:
100
+ x = self.decoder.mid_block(x, None, [0])
101
+
102
+ ## upsamples
103
+ for i, up_block in enumerate(self.decoder.up_blocks):
104
+ if self.network_checkpointing and torch.is_grad_enabled():
105
+ x = torch.utils.checkpoint.checkpoint(up_block, x, None, [0], True, use_reentrant=False)
106
+ else:
107
+ x = up_block(x, None, [0], first_chunk=True)
108
+
109
+ # head
110
+ x = self.decoder.norm_out(x)
111
+ x = self.decoder.nonlinearity(x)
112
+ x = self.decoder.conv_out(x)
113
+
114
+ # if self.patch_size is not None:
115
+ # x = unpatchify(x, patch_size=self.patch_size)
116
+
117
+ return x
118
+
119
+ def forward(self, feats, z, cameras):
120
+
121
+ x = self.decode(feats, z).squeeze(2)
122
+
123
+ gaussian_params = self.gs_head(x, cameras.flatten(0, 1)).unflatten(0, (cameras.shape[0], cameras.shape[1]))
124
+
125
+ return gaussian_params
126
+
127
+ # def forward(self, images, cameras, scene_chunk_lens):
128
+
129
+ # x, z, feats = self.encode(images)
130
+
131
+ # return self.reconstruct(x, z, feats, cameras, scene_chunk_lens)
132
+
133
+ @torch.amp.autocast(device_type='cuda', enabled=False)
134
+ def render(self, gaussian_params, camerass, height, width, bg_mode='random'):
135
+
136
+ camerass = camerass.to(torch.float32)
137
+
138
+ test_c2ws = torch.eye(4, device=camerass.device)[None][None].repeat(camerass.shape[0], camerass.shape[1], 1, 1).float()
139
+ test_c2ws[:, :, :3, :3] = quaternion_to_matrix(camerass[:, :, :4])
140
+ test_c2ws[:, :, :3, 3] = camerass[:, :, 4:7]
141
+
142
+ test_intr = torch.eye(3, device=camerass.device)[None, None].repeat(camerass.shape[0], camerass.shape[1], 1, 1).float()
143
+ fx, fy, cx, cy = camerass[:, :, 7:11].split([1, 1, 1, 1], dim=-1)
144
+
145
+ test_intr = torch.cat([fx * width, fy * height, cx * width, cy * height], dim=-1)
146
+
147
+ return gaussian_render(gaussian_params, test_c2ws, test_intr, width, height, use_checkpoint=self.render_checkpointing, sh_degree=self.gs_head.sh_degree, bg_mode=bg_mode)
148
+
149
+ from torch.autograd import Function
150
+
151
+ class _trunc_exp(Function):
152
+ @staticmethod
153
+ def forward(ctx, x):
154
+ ctx.save_for_backward(x)
155
+ return torch.exp(x)
156
+
157
+ @staticmethod
158
+ def backward(ctx, g):
159
+ x = ctx.saved_tensors[0]
160
+ return g * torch.exp(x.clamp(-10, 10))
161
+
162
+ trunc_exp = _trunc_exp.apply
163
+
164
+ class PixelAligned3DGS(nn.Module):
165
+ def __init__(
166
+ self,
167
+ embed_dim,
168
+ sh_degree=2,
169
+ use_mask=False,
170
+ scale_range=(0, 16), # related to pixel size
171
+ num_points_per_pixel=1,
172
+ ):
173
+ super().__init__()
174
+
175
+ self.sh_degree = sh_degree
176
+
177
+ # sh, uv_offset, depth, opacity, scales, rotations
178
+ # TODO: handle different sh_degree
179
+ self.gaussian_channels = [3 * (self.sh_degree + 1) ** 2, 2, 1, 1, 3, 4, (1 if use_mask else 0)]
180
+
181
+ self.gs_proj = nn.Conv2d(embed_dim, num_points_per_pixel * sum(self.gaussian_channels), 3, 1, 1)
182
+ self.register_buffer("lrs_mul", torch.Tensor(
183
+ [1] * 3 + # sh 0
184
+ [0.5] * 3 * ((self.sh_degree + 1) ** 2 - 1) + # other sh
185
+ [0.01] * 2 + # uv_offset
186
+ [1] * 1 + # depth
187
+ [1] * 1 + # opacity
188
+ [1] * 3 + # scales
189
+ [1] * 4 + # rotations
190
+ [0.1] * (1 if use_mask else 0) # mask
191
+ ).repeat(num_points_per_pixel), persistent=True)
192
+
193
+ self.lrs_mul = self.lrs_mul / self.lrs_mul.max()
194
+
195
+ self.use_mask = use_mask
196
+
197
+ self.scale_range = scale_range
198
+
199
+ with torch.no_grad():
200
+ self.gs_proj.weight.data.zero_()
201
+ self.gs_proj.bias = nn.Parameter(torch.Tensor(
202
+ [0.0] * 3 * (self.sh_degree + 1) ** 2 + # sh
203
+ [0.0] * 2 + # uv_offset
204
+ [math.log(1)] * 1 + # depth
205
+ # [inverse_softplus(1)] * 1 + # depth
206
+ [inverse_sigmoid(0.1)] * 1 + # opacity
207
+ [inverse_sigmoid((1 - scale_range[0]) / (scale_range[1] - scale_range[0]))] * 3 + # scales (default: 1 hence the gaussian scale is equal to pixel size)
208
+ # [inverse_softplus(0.005)] * 3 + # scales (default: 1 hence the gaussian scale is equal to pixel size)
209
+ [1., 0, 0, 0] + # rotations
210
+ [inverse_sigmoid(0.9)] * (1 if use_mask else 0) # mask (default: 0.9)
211
+ ).repeat(num_points_per_pixel) / self.lrs_mul)
212
+
213
+ self.num_points_per_pixel = num_points_per_pixel
214
+
215
+ @torch.amp.autocast(device_type='cuda', enabled=False)
216
+ def forward(self, x, cameras):
217
+
218
+ x = x.to(torch.float32)
219
+ cameras = cameras.to(torch.float32)
220
+
221
+ BN, _, h, w = x.shape
222
+
223
+ local_gaussian_params = F.conv2d(x, self.gs_proj.weight * self.lrs_mul[:, None, None, None], self.gs_proj.bias * self.lrs_mul, stride=1, padding=1).unflatten(1, (self.num_points_per_pixel, -1))
224
+ # local_gaussian_params = F.conv2d(x, self.gs_proj.weight, self.gs_proj.bias, stride=1, padding=1).unflatten(1, (self.num_points_per_pixel, -1))
225
+
226
+ # batch * n_frame, num_points_per_pixel, c, h, w -> batch * n_frame, num_points_per_pixel, h, w, c
227
+ local_gaussian_params = local_gaussian_params.permute(0, 1, 3, 4, 2)
228
+
229
+ features, uv_offset, depth, opacity, scales, rotations, mask = local_gaussian_params.split(self.gaussian_channels, dim=-1)
230
+
231
+ rays_o, rays_d = create_rays(cameras[:, None].repeat(1, self.num_points_per_pixel, 1), uv_offset=uv_offset, h=h, w=w)
232
+
233
+ depth = trunc_exp(depth)
234
+ # depth = F.softplus(depth, beta=1)
235
+ xyz = (rays_o + depth * rays_d)
236
+
237
+ # features = features.unflatten(-1, (-1, 3))
238
+
239
+ opacity = torch.sigmoid(opacity)
240
+ if self.use_mask:
241
+ if torch.is_grad_enabled():
242
+ mask = torch.sigmoid(mask)
243
+ hard_mask = (mask > torch.rand_like(mask)).float()
244
+ opacity = opacity * (mask + (hard_mask - mask).detach())
245
+ else:
246
+ mask = torch.sigmoid(mask)
247
+ hard_mask = (mask > torch.rand_like(mask)).float()
248
+ opacity = opacity * hard_mask
249
+
250
+ fx, fy = cameras[:, 7:9].split([1, 1], dim=-1)
251
+ fx, fy = fx / w, fy / h
252
+ pixel_size = torch.sqrt(fx.pow(2) + fy.pow(2))[:, None, None, None] * depth
253
+ scales = (torch.sigmoid(scales) * (self.scale_range[1] - self.scale_range[0]) + self.scale_range[0]) * pixel_size
254
+ # scales = F.softplus(scales, beta=1)
255
+
256
+ # It’s not required to be normalized for gspalt rasterization?
257
+ rotations = torch.nn.functional.normalize(rotations, dim=-1)
258
+
259
+ gaussian_params = torch.cat([xyz, opacity, scales, rotations, features], dim=-1)
260
+
261
+ return gaussian_params
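For reference, the per-Gaussian parameter layout produced by `PixelAligned3DGS.forward` follows the `torch.cat` above. A minimal sketch of splitting such a tensor back into its components (dummy shapes are illustrative only):

```python
import torch

sh_degree = 2
feature_dim = 3 * (sh_degree + 1) ** 2   # 27 SH coefficients for RGB
num_gaussians = 8
params = torch.randn(num_gaussians, 3 + 1 + 3 + 4 + feature_dim)

# Order matches the concatenation in PixelAligned3DGS.forward:
xyz, opacity, scales, rotations, features = params.split(
    [3, 1, 3, 4, feature_dim], dim=-1
)
print(xyz.shape, opacity.shape, scales.shape, rotations.shape, features.shape)
```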
models/render.py ADDED
@@ -0,0 +1,138 @@
+ import os
+ import time
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from gsplat import rasterization
+
+ # torch.backends.cuda.preferred_linalg_library(backend="magma")
+
+ """
+ modified from https://github.com/arthurhero/Long-LRM/blob/main/model/llrm.py
+ """
+ class GaussianRendererWithCheckpoint(torch.autograd.Function):
+     @staticmethod
+     def render(xyz, feature, scale, rotation, opacity, test_c2w, test_intr,
+                W, H, sh_degree, near_plane, far_plane, backgrounds):
+         test_w2c = test_c2w.float().inverse().unsqueeze(0)  # (1, 4, 4)
+         test_intr_i = torch.zeros(3, 3).to(test_intr.device)
+         test_intr_i[0, 0] = test_intr[0]
+         test_intr_i[1, 1] = test_intr[1]
+         test_intr_i[0, 2] = test_intr[2]
+         test_intr_i[1, 2] = test_intr[3]
+         test_intr_i[2, 2] = 1
+         test_intr_i = test_intr_i.unsqueeze(0)  # (1, 3, 3)
+         rendering, alpha, _ = rasterization(xyz, rotation, scale, opacity, feature,
+                                             test_w2c, test_intr_i, W, H, sh_degree=sh_degree,
+                                             near_plane=near_plane, far_plane=far_plane,
+                                             render_mode="RGB+D",
+                                             backgrounds=backgrounds[None],
+                                             rasterize_mode='classic')  # (1, H, W, 4)
+         # rendering[..., 3:] = rendering[..., 3:] + far_plane * (1 - alpha)
+         return rendering
+
+     @staticmethod
+     def forward(ctx, xyz, feature, scale, rotation, opacity, test_c2ws, test_intr,
+                 W, H, sh_degree, near_plane, far_plane, backgrounds):
+         ctx.save_for_backward(xyz, feature, scale, rotation, opacity, test_c2ws, test_intr, backgrounds)
+         ctx.W = W
+         ctx.H = H
+         ctx.sh_degree = sh_degree
+         ctx.near_plane = near_plane
+         ctx.far_plane = far_plane
+         with torch.no_grad():
+             V, _ = test_intr.shape
+             renderings = torch.zeros(V, H, W, 4).to(xyz.device)
+             alphas = torch.rand(V, device=xyz.device)
+             for iv in range(V):
+                 rendering = GaussianRendererWithCheckpoint.render(xyz, feature, scale, rotation, opacity,
+                                                                   test_c2ws[iv], test_intr[iv], W, H, sh_degree, near_plane, far_plane, backgrounds[iv])
+                 renderings[iv:iv+1] = rendering
+
+         renderings = renderings.requires_grad_()
+         return renderings
+
+     @staticmethod
+     def backward(ctx, grad_output):
+         # Re-render each view with gradients enabled and backpropagate through it,
+         # trading compute for memory (views are never kept in the autograd graph).
+         xyz, feature, scale, rotation, opacity, test_c2ws, test_intr, backgrounds = ctx.saved_tensors
+         xyz = xyz.detach().requires_grad_()
+         feature = feature.detach().requires_grad_()
+         scale = scale.detach().requires_grad_()
+         rotation = rotation.detach().requires_grad_()
+         opacity = opacity.detach().requires_grad_()
+         W = ctx.W
+         H = ctx.H
+         sh_degree = ctx.sh_degree
+         near_plane = ctx.near_plane
+         far_plane = ctx.far_plane
+         with torch.enable_grad():
+             V, _ = test_intr.shape
+             for iv in range(V):
+                 rendering = GaussianRendererWithCheckpoint.render(xyz, feature, scale, rotation, opacity,
+                                                                   test_c2ws[iv], test_intr[iv], W, H, sh_degree, near_plane, far_plane, backgrounds[iv])
+                 rendering.backward(grad_output[iv:iv+1])
+
+         return xyz.grad, feature.grad, scale.grad, rotation.grad, opacity.grad, None, None, None, None, None, None, None, None
+
+ def gaussian_render(gaussian_params, test_c2ws, test_intr, W, H, near_plane=0.01, far_plane=1000, use_checkpoint=False, sh_degree=0, bg_mode='random'):
+
+     if not torch.is_grad_enabled():
+         use_checkpoint = False
+
+     # opengl2colmap, see https://github.com/imlixinyang/Director3D/blob/main/modules/renderers/gaussians_renderer.py
+     test_c2ws[:, :, :3, 1:3] *= -1
+
+     device = test_intr.device
+     B, V, _ = test_intr.shape
+
+     renderings = []
+
+     for ib in range(B):
+         if bg_mode == 'random':
+             backgrounds = torch.rand(V, 3).to(device)
+         elif bg_mode == 'white':
+             backgrounds = torch.ones(V, 3).to(device)
+         elif bg_mode == 'black':
+             backgrounds = torch.zeros(V, 3).to(device)
+         else:
+             raise ValueError(f"Invalid background mode: {bg_mode}")
+
+         xyz_i, opacity_i, scale_i, rotation_i, feature_i = gaussian_params[ib].float().split([3, 1, 3, 4, (sh_degree + 1)**2 * 3], dim=-1)
+
+         opacity_i = opacity_i.squeeze(-1)
+         feature_i = feature_i.reshape(-1, (sh_degree + 1)**2, 3)
+
+         if use_checkpoint:
+             renderings.append(GaussianRendererWithCheckpoint.apply(xyz_i, feature_i, scale_i, rotation_i, opacity_i, test_c2ws[ib], test_intr[ib], W, H, sh_degree, near_plane, far_plane, backgrounds))
+         else:
+             rendering = torch.zeros(V, H, W, 4).to(device)
+             for iv in range(V):
+                 rendering[iv:iv+1] = GaussianRendererWithCheckpoint.render(xyz_i, feature_i, scale_i, rotation_i, opacity_i,
+                                                                            test_c2ws[ib][iv], test_intr[ib][iv], W, H, sh_degree, near_plane, far_plane, backgrounds[iv])
+
+             # test_w2c_i = test_c2ws[ib].float().inverse()  # (V, 4, 4)
+             # test_intr_i = torch.zeros(V, 3, 3).to(device)
+             # test_intr_i[:, 0, 0] = test_intr[ib, :, 0]
+             # test_intr_i[:, 1, 1] = test_intr[ib, :, 1]
+             # test_intr_i[:, 0, 2] = test_intr[ib, :, 2]
+             # test_intr_i[:, 1, 2] = test_intr[ib, :, 3]
+             # test_intr_i[:, 2, 2] = 1
+
+             # rendering, _, _ = rasterization(xyz_i, rotation_i, scale_i, opacity_i, feature_i,
+             #                                 test_w2c_i, test_intr_i, W, H, sh_degree=sh_degree,
+             #                                 near_plane=near_plane, far_plane=far_plane,
+             #                                 render_mode="RGB+D",
+             #                                 backgrounds=backgrounds,
+             #                                 rasterize_mode='classic')  # (V, H, W, 4)
+             renderings.append(rendering)
+
+     renderings = torch.stack(renderings, dim=0).permute(0, 1, 4, 2, 3).contiguous()  # (B, V, 4, H, W)
+     rgb = renderings[:, :, :3].mul_(2).add_(-1).clamp(-1, 1)
+     depth = renderings[:, :, 3:]
+     return rgb, depth
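A minimal sketch of calling `gaussian_render` with dummy inputs; shapes are illustrative, and the actual call (commented out) requires a CUDA device with `gsplat` installed:

```python
import torch

B, V, N, sh_degree = 1, 2, 128, 0
W = H = 64
# [xyz(3), opacity(1), scales(3), rotations(4), SH features]
gaussian_params = torch.randn(B, N, 3 + 1 + 3 + 4 + (sh_degree + 1) ** 2 * 3)
test_c2ws = torch.eye(4).expand(B, V, 4, 4).clone()            # OpenGL camera-to-world
test_intr = torch.tensor([W, H, W / 2, H / 2]).expand(B, V, 4).float()  # fx, fy, cx, cy in pixels

# rgb: (B, V, 3, H, W) in [-1, 1]; depth: (B, V, 1, H, W)
# rgb, depth = gaussian_render(gaussian_params.cuda(), test_c2ws.cuda(),
#                              test_intr.cuda(), W, H, sh_degree=sh_degree, bg_mode='white')
```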
models/transformer_wan.py ADDED
@@ -0,0 +1,601 @@
+ # Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import math
+ from typing import Any, Dict, Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
+ from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
+ from diffusers.models.attention import FeedForward
+ from diffusers.models.attention_processor import Attention
+ from diffusers.models.cache_utils import CacheMixin
+ from diffusers.models.embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps, get_1d_rotary_pos_embed
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
+ from diffusers.models.modeling_utils import ModelMixin
+
+ try:
+     from sageattention import sageattn
+ except ImportError:
+     sageattn = None
+
+ class FP32LayerNorm(nn.LayerNorm):
+     def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+         return F.layer_norm(
+             inputs,
+             self.normalized_shape,
+             self.weight,
+             self.bias,
+             self.eps,
+         ).to(inputs.dtype)
+
+ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+ class WanAttnProcessor2_0:
+     def __init__(self):
+         if not hasattr(F, "scaled_dot_product_attention"):
+             raise ImportError("WanAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.")
+
+     def __call__(
+         self,
+         attn: Attention,
+         hidden_states: torch.Tensor,
+         encoder_hidden_states: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         rotary_emb: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         encoder_hidden_states_img = None
+         if attn.add_k_proj is not None:
+             # 512 is the context length of the text encoder, hardcoded for now
+             image_context_length = encoder_hidden_states.shape[1] - 512
+             encoder_hidden_states_img = encoder_hidden_states[:, :image_context_length]
+             encoder_hidden_states = encoder_hidden_states[:, image_context_length:]
+         if encoder_hidden_states is None:
+             encoder_hidden_states = hidden_states
+
+         query = attn.to_q(hidden_states)
+         key = attn.to_k(encoder_hidden_states)
+         value = attn.to_v(encoder_hidden_states)
+
+         if attn.norm_q is not None:
+             query = attn.norm_q(query).to(hidden_states.dtype)
+         if attn.norm_k is not None:
+             key = attn.norm_k(key).to(hidden_states.dtype)
+
+         query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+         key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+         value = value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+
+         if rotary_emb is not None:
+
+             def apply_rotary_emb(
+                 hidden_states: torch.Tensor,
+                 freqs_cos: torch.Tensor,
+                 freqs_sin: torch.Tensor,
+             ):
+                 x = hidden_states.view(*hidden_states.shape[:-1], -1, 2)
+                 x1, x2 = x[..., 0], x[..., 1]
+                 cos = freqs_cos[..., 0::2]
+                 sin = freqs_sin[..., 1::2]
+                 out = torch.empty_like(hidden_states)
+                 out[..., 0::2] = x1 * cos - x2 * sin
+                 out[..., 1::2] = x1 * sin + x2 * cos
+                 return out.type_as(hidden_states)
+
+             query = apply_rotary_emb(query, *rotary_emb)
+             key = apply_rotary_emb(key, *rotary_emb)
+
+         # I2V task
+         hidden_states_img = None
+         if encoder_hidden_states_img is not None:
+             key_img = attn.add_k_proj(encoder_hidden_states_img)
+             key_img = attn.norm_added_k(key_img)
+             value_img = attn.add_v_proj(encoder_hidden_states_img)
+
+             key_img = key_img.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+             value_img = value_img.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+
+             if sageattn is not None:
+                 # Ensure kernels receive fp16/bf16 tensors under autocast
+                 if torch.is_autocast_enabled() and query.dtype not in (torch.float16, torch.bfloat16):
+                     target_dtype = torch.bfloat16
+                     query = query.to(target_dtype)
+                     key_img = key_img.to(target_dtype)
+                     value_img = value_img.to(target_dtype)
+                 hidden_states_img = sageattn(
+                     query, key_img, value_img, attn_mask=None, dropout_p=0.0, is_causal=False
+                 )
+             else:
+                 hidden_states_img = F.scaled_dot_product_attention(
+                     query, key_img, value_img, attn_mask=None, dropout_p=0.0, is_causal=False
+                 )
+
+             hidden_states_img = hidden_states_img.transpose(1, 2).flatten(2, 3)
+             hidden_states_img = hidden_states_img.type_as(query)
+
+         if sageattn is not None:
+             # Ensure kernels receive fp16/bf16 tensors under autocast
+             if torch.is_autocast_enabled() and query.dtype not in (torch.float16, torch.bfloat16):
+                 target_dtype = torch.bfloat16
+                 query = query.to(target_dtype)
+                 key = key.to(target_dtype)
+                 value = value.to(target_dtype)
+             hidden_states = sageattn(
+                 query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+             )
+         else:
+             hidden_states = F.scaled_dot_product_attention(
+                 query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+             )
+
+         hidden_states = hidden_states.transpose(1, 2).flatten(2, 3)
+         hidden_states = hidden_states.type_as(query)
+
+         if hidden_states_img is not None:
+             hidden_states = hidden_states + hidden_states_img
+
+         hidden_states = attn.to_out[0](hidden_states)
+         hidden_states = attn.to_out[1](hidden_states)
+         return hidden_states
+
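A small standalone check of the interleaved rotary embedding used in `apply_rotary_emb` above: each (even, odd) channel pair is rotated by an angle theta. The values and helper name below are illustrative:

```python
import torch

def rotate_pairs(x: torch.Tensor, theta: torch.Tensor) -> torch.Tensor:
    # x: (..., 2 * n); theta: (..., n), one rotation angle per channel pair
    x1, x2 = x[..., 0::2], x[..., 1::2]
    cos, sin = torch.cos(theta), torch.sin(theta)
    out = torch.empty_like(x)
    out[..., 0::2] = x1 * cos - x2 * sin
    out[..., 1::2] = x1 * sin + x2 * cos
    return out

x = torch.tensor([1.0, 0.0, 0.0, 1.0])              # two channel pairs
theta = torch.tensor([torch.pi / 2, torch.pi / 2])  # rotate both by 90 degrees
print(rotate_pairs(x, theta))                       # ~[0, 1, -1, 0]
```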
+
+ class WanImageEmbedding(torch.nn.Module):
+     def __init__(self, in_features: int, out_features: int, pos_embed_seq_len=None):
+         super().__init__()
+
+         self.norm1 = FP32LayerNorm(in_features)
+         self.ff = FeedForward(in_features, out_features, mult=1, activation_fn="gelu")
+         self.norm2 = FP32LayerNorm(out_features)
+         if pos_embed_seq_len is not None:
+             self.pos_embed = nn.Parameter(torch.zeros(1, pos_embed_seq_len, in_features))
+         else:
+             self.pos_embed = None
+
+     def forward(self, encoder_hidden_states_image: torch.Tensor) -> torch.Tensor:
+         if self.pos_embed is not None:
+             batch_size, seq_len, embed_dim = encoder_hidden_states_image.shape
+             encoder_hidden_states_image = encoder_hidden_states_image.view(-1, 2 * seq_len, embed_dim)
+             encoder_hidden_states_image = encoder_hidden_states_image + self.pos_embed
+
+         hidden_states = self.norm1(encoder_hidden_states_image)
+         hidden_states = self.ff(hidden_states)
+         hidden_states = self.norm2(hidden_states)
+         return hidden_states
+
+
+ class WanTimeTextImageEmbedding(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         time_freq_dim: int,
+         time_proj_dim: int,
+         text_embed_dim: int,
+         image_embed_dim: Optional[int] = None,
+         pos_embed_seq_len: Optional[int] = None,
+     ):
+         super().__init__()
+
+         self.timesteps_proj = Timesteps(num_channels=time_freq_dim, flip_sin_to_cos=True, downscale_freq_shift=0)
+         self.time_embedder = TimestepEmbedding(in_channels=time_freq_dim, time_embed_dim=dim)
+         self.act_fn = nn.SiLU()
+         self.time_proj = nn.Linear(dim, time_proj_dim)
+         self.text_embedder = PixArtAlphaTextProjection(text_embed_dim, dim, act_fn="gelu_tanh")
+
+         self.image_embedder = None
+         if image_embed_dim is not None:
+             self.image_embedder = WanImageEmbedding(image_embed_dim, dim, pos_embed_seq_len=pos_embed_seq_len)
+
+     def forward(
+         self,
+         timestep: torch.Tensor,
+         encoder_hidden_states: torch.Tensor,
+         encoder_hidden_states_image: Optional[torch.Tensor] = None,
+         timestep_seq_len: Optional[int] = None,
+     ):
+         timestep = self.timesteps_proj(timestep)
+         if timestep_seq_len is not None:
+             timestep = timestep.unflatten(0, (1, timestep_seq_len))
+
+         time_embedder_dtype = next(iter(self.time_embedder.parameters())).dtype
+         if timestep.dtype != time_embedder_dtype and time_embedder_dtype != torch.int8:
+             timestep = timestep.to(time_embedder_dtype)
+         temb = self.time_embedder(timestep).type_as(encoder_hidden_states)
+         timestep_proj = self.time_proj(self.act_fn(temb))
+
+         encoder_hidden_states = self.text_embedder(encoder_hidden_states)
+         if encoder_hidden_states_image is not None:
+             encoder_hidden_states_image = self.image_embedder(encoder_hidden_states_image)
+
+         return temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image
+
+
+ class WanRotaryPosEmbed(nn.Module):
+     def __init__(
+         self,
+         attention_head_dim: int,
+         patch_size: Tuple[int, int, int],
+         max_seq_len: int,
+         theta: float = 10000.0,
+     ):
+         super().__init__()
+
+         self.attention_head_dim = attention_head_dim
+         self.patch_size = patch_size
+         self.max_seq_len = max_seq_len
+
+         h_dim = w_dim = 2 * (attention_head_dim // 6)
+         t_dim = attention_head_dim - h_dim - w_dim
+         freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
+
+         freqs_cos = []
+         freqs_sin = []
+
+         for dim in [t_dim, h_dim, w_dim]:
+             freq_cos, freq_sin = get_1d_rotary_pos_embed(
+                 dim,
+                 max_seq_len,
+                 theta,
+                 use_real=True,
+                 repeat_interleave_real=True,
+                 freqs_dtype=freqs_dtype,
+             )
+             freqs_cos.append(freq_cos)
+             freqs_sin.append(freq_sin)
+
+         self.register_buffer("freqs_cos", torch.cat(freqs_cos, dim=1), persistent=False)
+         self.register_buffer("freqs_sin", torch.cat(freqs_sin, dim=1), persistent=False)
+
+     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         batch_size, num_channels, num_frames, height, width = hidden_states.shape
+         p_t, p_h, p_w = self.patch_size
+         ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
+
+         split_sizes = [
+             self.attention_head_dim - 2 * (self.attention_head_dim // 3),
+             self.attention_head_dim // 3,
+             self.attention_head_dim // 3,
+         ]
+
+         freqs_cos = self.freqs_cos.split(split_sizes, dim=1)
+         freqs_sin = self.freqs_sin.split(split_sizes, dim=1)
+
+         freqs_cos_f = freqs_cos[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+         freqs_cos_h = freqs_cos[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+         freqs_cos_w = freqs_cos[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+
+         freqs_sin_f = freqs_sin[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+         freqs_sin_h = freqs_sin[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+         freqs_sin_w = freqs_sin[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+
+         freqs_cos = torch.cat([freqs_cos_f, freqs_cos_h, freqs_cos_w], dim=-1).reshape(1, 1, ppf * pph * ppw, -1)
+         freqs_sin = torch.cat([freqs_sin_f, freqs_sin_h, freqs_sin_w], dim=-1).reshape(1, 1, ppf * pph * ppw, -1)
+
+         return freqs_cos, freqs_sin
+
+
+ @maybe_allow_in_graph
+ class WanTransformerBlock(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         ffn_dim: int,
+         num_heads: int,
+         qk_norm: str = "rms_norm_across_heads",
+         cross_attn_norm: bool = False,
+         eps: float = 1e-6,
+         added_kv_proj_dim: Optional[int] = None,
+     ):
+         super().__init__()
+
+         # 1. Self-attention
+         self.norm1 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+         self.attn1 = Attention(
+             query_dim=dim,
+             heads=num_heads,
+             kv_heads=num_heads,
+             dim_head=dim // num_heads,
+             qk_norm=qk_norm,
+             eps=eps,
+             bias=True,
+             cross_attention_dim=None,
+             out_bias=True,
+             processor=WanAttnProcessor2_0(),
+         )
+
+         # 2. Cross-attention
+         self.attn2 = Attention(
+             query_dim=dim,
+             heads=num_heads,
+             kv_heads=num_heads,
+             dim_head=dim // num_heads,
+             qk_norm=qk_norm,
+             eps=eps,
+             bias=True,
+             cross_attention_dim=None,
+             out_bias=True,
+             added_kv_proj_dim=added_kv_proj_dim,
+             added_proj_bias=True,
+             processor=WanAttnProcessor2_0(),
+         )
+         self.norm2 = FP32LayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity()
+
+         # 3. Feed-forward
+         self.ffn = FeedForward(dim, inner_dim=ffn_dim, activation_fn="gelu-approximate")
+         self.norm3 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+
+         self.scale_shift_table = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         encoder_hidden_states: torch.Tensor,
+         temb: torch.Tensor,
+         rotary_emb: torch.Tensor,
+     ) -> torch.Tensor:
+         if temb.ndim == 4:
+             # temb: batch_size, seq_len, 6, inner_dim (wan2.2 ti2v)
+             shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+                 self.scale_shift_table.unsqueeze(0) + temb
+             ).chunk(6, dim=2)
+             # batch_size, seq_len, 1, inner_dim
+             shift_msa = shift_msa.squeeze(2)
+             scale_msa = scale_msa.squeeze(2)
+             gate_msa = gate_msa.squeeze(2)
+             c_shift_msa = c_shift_msa.squeeze(2)
+             c_scale_msa = c_scale_msa.squeeze(2)
+             c_gate_msa = c_gate_msa.squeeze(2)
+         else:
+             # temb: batch_size, 6, inner_dim (wan2.1/wan2.2 14B)
+             shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+                 self.scale_shift_table + temb
+             ).chunk(6, dim=1)
+
+         # 1. Self-attention
+         norm_hidden_states = (self.norm1(hidden_states).mul_(1 + scale_msa).add_(shift_msa))
+         attn_output = self.attn1(hidden_states=norm_hidden_states, rotary_emb=rotary_emb)
+         hidden_states += attn_output * gate_msa
+
+         # 2. Cross-attention
+         norm_hidden_states = self.norm2(hidden_states)
+         attn_output = self.attn2(hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
+         hidden_states += attn_output
+
+         # 3. Feed-forward
+         norm_hidden_states = (self.norm3(hidden_states).mul_(1 + c_scale_msa).add_(c_shift_msa))
+         ff_output = self.ffn(norm_hidden_states)
+         hidden_states += ff_output.mul_(c_gate_msa)
+
+         return hidden_states
+
+
+ class WanTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
+     r"""
+     A Transformer model for video-like data used in the Wan model.
+
+     Args:
+         patch_size (`Tuple[int]`, defaults to `(1, 2, 2)`):
+             3D patch dimensions for video embedding (t_patch, h_patch, w_patch).
+         num_attention_heads (`int`, defaults to `40`):
+             The number of attention heads.
+         attention_head_dim (`int`, defaults to `128`):
+             The number of channels in each head.
+         in_channels (`int`, defaults to `16`):
+             The number of channels in the input.
+         out_channels (`int`, defaults to `16`):
+             The number of channels in the output.
+         text_dim (`int`, defaults to `4096`):
+             Input dimension for text embeddings.
+         freq_dim (`int`, defaults to `256`):
+             Dimension for sinusoidal time embeddings.
+         ffn_dim (`int`, defaults to `13824`):
+             Intermediate dimension in the feed-forward network.
+         num_layers (`int`, defaults to `40`):
+             The number of layers of transformer blocks to use.
+         cross_attn_norm (`bool`, defaults to `True`):
+             Enable cross-attention normalization.
+         qk_norm (`str`, *optional*, defaults to `"rms_norm_across_heads"`):
+             The query/key normalization variant to use.
+         eps (`float`, defaults to `1e-6`):
+             Epsilon value for normalization layers.
+         added_kv_proj_dim (`int`, *optional*, defaults to `None`):
+             The number of channels to use for the added key and value projections. If `None`, no projection is used.
+     """
+
+     _supports_gradient_checkpointing = True
+     _skip_layerwise_casting_patterns = ["patch_embedding", "condition_embedder", "norm"]
+     _no_split_modules = ["WanTransformerBlock"]
+     _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"]
+     _keys_to_ignore_on_load_unexpected = ["norm_added_q"]
+     _repeated_blocks = ["WanTransformerBlock"]
+
+     @register_to_config
+     def __init__(
+         self,
+         patch_size: Tuple[int] = (1, 2, 2),
+         num_attention_heads: int = 40,
+         attention_head_dim: int = 128,
+         in_channels: int = 16,
+         out_channels: int = 16,
+         text_dim: int = 4096,
+         freq_dim: int = 256,
+         ffn_dim: int = 13824,
+         num_layers: int = 40,
+         cross_attn_norm: bool = True,
+         qk_norm: Optional[str] = "rms_norm_across_heads",
+         eps: float = 1e-6,
+         image_dim: Optional[int] = None,
+         added_kv_proj_dim: Optional[int] = None,
+         rope_max_seq_len: int = 1024,
+         pos_embed_seq_len: Optional[int] = None,
+     ) -> None:
+         super().__init__()
+
+         inner_dim = num_attention_heads * attention_head_dim
+         out_channels = out_channels or in_channels
+
+         # 1. Patch & position embedding
+         self.rope = WanRotaryPosEmbed(attention_head_dim, patch_size, rope_max_seq_len)
+         self.patch_embedding = nn.Conv3d(in_channels, inner_dim, kernel_size=patch_size, stride=patch_size)
+
+         # 2. Condition embeddings
+         # image_embedding_dim=1280 for the I2V model
+         self.condition_embedder = WanTimeTextImageEmbedding(
+             dim=inner_dim,
+             time_freq_dim=freq_dim,
+             time_proj_dim=inner_dim * 6,
+             text_embed_dim=text_dim,
+             image_embed_dim=image_dim,
+             pos_embed_seq_len=pos_embed_seq_len,
+         )
+
+         # 3. Transformer blocks
+         self.blocks = nn.ModuleList(
+             [
+                 WanTransformerBlock(
+                     inner_dim, ffn_dim, num_attention_heads, qk_norm, cross_attn_norm, eps, added_kv_proj_dim
+                 )
+                 for _ in range(num_layers)
+             ]
+         )
+
+         # 4. Output norm & projection
+         self.norm_out = FP32LayerNorm(inner_dim, eps, elementwise_affine=False)
+         self.proj_out = nn.Linear(inner_dim, out_channels * math.prod(patch_size))
+         self.scale_shift_table = nn.Parameter(torch.randn(1, 2, inner_dim) / inner_dim**0.5)
+
+         self.gradient_checkpointing = False
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         timestep: torch.LongTensor,
+         encoder_hidden_states: torch.Tensor,
+         encoder_hidden_states_image: Optional[torch.Tensor] = None,
+         return_dict: bool = True,
+         attention_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+         if attention_kwargs is not None:
+             attention_kwargs = attention_kwargs.copy()
+             lora_scale = attention_kwargs.pop("scale", 1.0)
+         else:
+             lora_scale = 1.0
+
+         if USE_PEFT_BACKEND:
+             # weight the lora layers by setting `lora_scale` for each PEFT layer
+             scale_lora_layers(self, lora_scale)
+         else:
+             if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+                 logger.warning(
+                     "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
+                 )
+
+         batch_size, num_channels, num_frames, height, width = hidden_states.shape
+         p_t, p_h, p_w = self.config.patch_size
+         post_patch_num_frames = num_frames // p_t
+         post_patch_height = height // p_h
+         post_patch_width = width // p_w
+
+         rotary_emb = self.rope(hidden_states)
+
+         hidden_states = self.patch_embedding(hidden_states)
+         hidden_states = hidden_states.flatten(2).transpose(1, 2)
+
+         # timestep shape: batch_size, or batch_size, seq_len (wan 2.2 ti2v)
+         if timestep.ndim == 2:
+             ts_seq_len = timestep.shape[1]
+             timestep = timestep.flatten()  # batch_size * seq_len
+         else:
+             ts_seq_len = None
+
+         temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
+             timestep, encoder_hidden_states, encoder_hidden_states_image, timestep_seq_len=ts_seq_len
+         )
+         if ts_seq_len is not None:
+             # batch_size, seq_len, 6, inner_dim
+             timestep_proj = timestep_proj.unflatten(2, (6, -1))
+         else:
+             # batch_size, 6, inner_dim
+             timestep_proj = timestep_proj.unflatten(1, (6, -1))
+
+         if encoder_hidden_states_image is not None:
+             encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
+
+         if True:
+             # Run the transformer blocks in bfloat16 regardless of the ambient dtype.
+             encoder_hidden_states = encoder_hidden_states.to(torch.bfloat16)
+             timestep_proj = timestep_proj.to(torch.bfloat16)
+             rotary_emb = [rotary_emb[0].to(torch.bfloat16), rotary_emb[1].to(torch.bfloat16)]
+             hidden_states = hidden_states.to(torch.bfloat16)
+
+         # 4. Transformer blocks
+         if torch.is_grad_enabled() and self.gradient_checkpointing:
+             for block in self.blocks:
+                 hidden_states = self._gradient_checkpointing_func(
+                     block, hidden_states, encoder_hidden_states, timestep_proj, rotary_emb
+                 )
+         else:
+             for block in self.blocks:
+                 hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
+
+         # 5. Output norm, projection & unpatchify
+         if temb.ndim == 3:
+             # batch_size, seq_len, inner_dim (wan 2.2 ti2v)
+             shift, scale = (self.scale_shift_table.unsqueeze(0) + temb.unsqueeze(2)).chunk(2, dim=2)
+             shift = shift.squeeze(2)
+             scale = scale.squeeze(2)
+         else:
+             # batch_size, inner_dim
+             shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)
+
+         # Move the shift and scale tensors to the same device as hidden_states.
+         # When using multi-GPU inference via accelerate these will be on the
+         # first device rather than the last device, which hidden_states ends up
+         # on.
+         shift = shift.to(hidden_states.device)
+         scale = scale.to(hidden_states.device)
+
+         hidden_states = (self.norm_out(hidden_states) * (1 + scale) + shift).type_as(hidden_states)
+         hidden_states = self.proj_out(hidden_states)
+
+         hidden_states = hidden_states.reshape(
+             batch_size, post_patch_num_frames, post_patch_height, post_patch_width, p_t, p_h, p_w, -1
+         )
+         hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
+         output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
+
+         if USE_PEFT_BACKEND:
+             # remove `lora_scale` from each PEFT layer
+             unscale_lora_layers(self, lora_scale)
+
+         if not return_dict:
+             return (output,)
+
+         return Transformer2DModelOutput(sample=output)
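A self-contained sketch of the unpatchify bookkeeping at the end of the forward pass above (pure tensor ops; sizes are illustrative):

```python
import torch

B, C, F_, H, W = 1, 4, 2, 8, 8
p_t, p_h, p_w = 1, 2, 2
ppf, pph, ppw = F_ // p_t, H // p_h, W // p_w

# After proj_out: one token per 3D patch, C * p_t * p_h * p_w channels each.
hidden = torch.randn(B, ppf * pph * ppw, C * p_t * p_h * p_w)

# Same reshape/permute/flatten sequence as in WanTransformer3DModel.forward:
hidden = hidden.reshape(B, ppf, pph, ppw, p_t, p_h, p_w, -1)
hidden = hidden.permute(0, 7, 1, 4, 2, 5, 3, 6)
out = hidden.flatten(6, 7).flatten(4, 5).flatten(2, 3)
print(out.shape)  # torch.Size([1, 4, 2, 8, 8]) -> back to (B, C, F, H, W)
```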
quant.py ADDED
@@ -0,0 +1,195 @@
+ import gc
+ from typing import Tuple
+ import copy
+ import torch
+ import tqdm
+
+
+ def cleanup_memory():
+     gc.collect()
+     torch.cuda.empty_cache()
+
+
+ def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]:
+     """Quantize a tensor using a per-tensor static scaling factor.
+
+     Args:
+         tensor: The input tensor.
+     """
+     finfo = torch.finfo(torch.float8_e4m3fn)
+     # Calculate the scale as dtype max divided by absmax.
+     # Since .abs() creates a new tensor, we use aminmax to get
+     # the min and max first and then calculate the absmax.
+     if tensor.numel() == 0:
+         # Deal with empty tensors (triggered by empty MoE experts)
+         min_val, max_val = (
+             torch.tensor(-16.0, dtype=tensor.dtype),
+             torch.tensor(16.0, dtype=tensor.dtype),
+         )
+     else:
+         min_val, max_val = tensor.aminmax()
+     amax = torch.maximum(min_val.abs(), max_val.abs())
+     scale = finfo.max / amax.clamp(min=1e-12)
+     # Scale and clamp the tensor to bring it into the representable
+     # range of the float8 data type (as the default cast is unsaturated).
+     qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max)
+     # Return both the float8 data and the inverse scale (as float),
+     # as both are required as inputs to torch._scaled_mm.
+     qweight = qweight.to(torch.float8_e4m3fn)
+     scale = scale.float().reciprocal()
+     return qweight, scale
+
+
+ def static_per_tensor_quantize(tensor: torch.Tensor, inv_scale: float) -> torch.Tensor:
+     """Quantizes a floating-point tensor to FP8 (E4M3 format) using static scaling.
+
+     Performs uniform quantization of the input tensor by:
+     1. Scaling the tensor values using the provided inverse scale factor
+     2. Clamping values to the representable range of the FP8 E4M3 format
+     3. Converting to the FP8 data type
+
+     Args:
+         tensor (torch.Tensor): Input tensor to be quantized (any floating-point dtype)
+         inv_scale (float): Inverse of the quantization scale factor (1/scale)
+             (Must be pre-calculated based on tensor statistics)
+
+     Returns:
+         torch.Tensor: Quantized tensor in torch.float8_e4m3fn format
+
+     Note:
+         - Uses the E4M3 format (4 exponent bits, 3 mantissa bits, no infinity/nan)
+         - This is a static quantization (the scale factor must be pre-determined)
+         - For dynamic quantization, see per_tensor_quantize()
+     """
+     finfo = torch.finfo(torch.float8_e4m3fn)
+     qweight = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max)
+     return qweight.to(torch.float8_e4m3fn)
+
+
+ def fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype, native_fp8_support=False):
+     """Performs an FP8 GEMM (General Matrix Multiplication) with optional native hardware support.
+
+     Args:
+         A (torch.Tensor): Input tensor A (FP8 or other dtype)
+         A_scale (torch.Tensor/float): Scale factor for tensor A
+         B (torch.Tensor): Input tensor B (FP8 or other dtype)
+         B_scale (torch.Tensor/float): Scale factor for tensor B
+         bias (torch.Tensor/None): Optional bias tensor
+         out_dtype (torch.dtype): Output data type
+         native_fp8_support (bool): Whether to use hardware-accelerated FP8 operations
+
+     Returns:
+         torch.Tensor: Result of the GEMM operation
+     """
+     if A.numel() == 0:
+         # Deal with empty tensors (triggered by empty MoE experts)
+         return torch.empty(size=(0, B.shape[0]), dtype=out_dtype, device=A.device)
+
+     if native_fp8_support:
+         need_reshape = A.dim() == 3
+         if need_reshape:
+             batch_size = A.shape[0]
+             A_input = A.reshape(-1, A.shape[-1])
+         else:
+             batch_size = None
+             A_input = A
+         output = torch._scaled_mm(
+             A_input,
+             B.t(),
+             out_dtype=out_dtype,
+             scale_a=A_scale,
+             scale_b=B_scale,
+             bias=bias,
+         )
+         if need_reshape:
+             output = output.reshape(
+                 batch_size, output.shape[0] // batch_size, output.shape[1]
+             )
+     else:
+         # Fallback: dequantize both operands and use a regular linear.
+         output = torch.nn.functional.linear(
+             A.to(out_dtype) * A_scale,
+             B.to(out_dtype) * B_scale.to(out_dtype),
+             bias=bias,
+         )
+
+     return output
+
+ def replace_module(model: torch.nn.Module, name: str, new_module: torch.nn.Module):
+     if "." in name:
+         parent_name = name.rsplit(".", 1)[0]
+         child_name = name[len(parent_name) + 1:]
+         parent = model.get_submodule(parent_name)
+     else:
+         parent_name = ""
+         parent = model
+         child_name = name
+     setattr(parent, child_name, new_module)
+
+
+ # Class responsible for quantizing weights
+ class FP8DynamicLinear(torch.nn.Module):
+     def __init__(
+         self,
+         weight: torch.Tensor,
+         weight_scale: torch.Tensor,
+         bias: torch.nn.Parameter,
+         native_fp8_support: bool = False,
+         dtype: torch.dtype = torch.bfloat16,
+     ):
+         super().__init__()
+         self.weight = torch.nn.Parameter(weight, requires_grad=False)
+         self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
+         self.bias = bias
+         self.native_fp8_support = native_fp8_support
+         self.dtype = dtype
+
+     # @torch.compile
+     def forward(self, x):
+         if x.dtype != self.dtype:
+             x = x.to(self.dtype)
+         # Dynamic activation quantization: scale is computed per forward pass.
+         qinput, x_scale = per_tensor_quantize(x)
+         output = fp8_gemm(
+             A=qinput,
+             A_scale=x_scale,
+             B=self.weight,
+             B_scale=self.weight_scale,
+             bias=self.bias,
+             out_dtype=x.dtype,
+             native_fp8_support=self.native_fp8_support,
+         )
+         return output
+
+
+ def FluxFp8GeMMProcessor(model: torch.nn.Module):
+     """Processes a PyTorch model to convert eligible Linear layers to FP8 precision.
+
+     This function performs the following operations:
+     1. Checks for native FP8 support on the current GPU
+     2. Identifies target Linear layers in transformer blocks
+     3. Quantizes weights to FP8 format
+     4. Replaces original Linear layers with FP8DynamicLinear versions
+     5. Performs memory cleanup
+
+     Args:
+         model (torch.nn.Module): The neural network model to be processed.
+             Should contain transformer blocks with Linear layers.
+     """
+     native_fp8_support = (
+         torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0)
+     )
+     named_modules = list(model.named_modules())
+     for name, linear in tqdm.tqdm(named_modules, desc="Quantizing weights to fp8"):
+         if isinstance(linear, torch.nn.Linear) and "blocks" in name:
+             quant_weight, weight_scale = per_tensor_quantize(linear.weight)
+             bias = copy.deepcopy(linear.bias) if linear.bias is not None else None
+             quant_linear = FP8DynamicLinear(
+                 weight=quant_weight,
+                 weight_scale=weight_scale,
+                 bias=bias,
+                 native_fp8_support=native_fp8_support,
+                 dtype=linear.weight.dtype,
+             )
+             replace_module(model, name, quant_linear)
+             del linear.weight
+             del linear.bias
+             del linear
+     cleanup_memory()
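A quick numeric sanity check of the dynamic FP8 path above: quantize a weight and an activation per tensor, then matmul via the non-native fallback. This assumes a torch build with `float8_e4m3fn` support (the pinned torch 2.6 has it):

```python
import torch

torch.manual_seed(0)
A = torch.randn(4, 8)    # activation
B = torch.randn(16, 8)   # weight, (out_features, in_features)

qA, a_scale = per_tensor_quantize(A)
qB, b_scale = per_tensor_quantize(B)
out = fp8_gemm(qA, a_scale, qB, b_scale, bias=None,
               out_dtype=torch.float32, native_fp8_support=False)

ref = A @ B.t()
print((out - ref).abs().max())  # quantization error, typically well under 1
```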
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ torch==2.6.0
+ torchvision==0.21.0
+ triton==3.2.0
+ transformers==4.57.0
+ omegaconf==2.3.0
+ ninja==1.13.0
+ numpy==2.2.6
+ einops==0.8.1
+ moviepy==1.0.3
+ opencv-python==4.12.0.88
+ av==15.1.0
+ plyfile==1.1.2
+ ftfy==6.3.1
+ flask==3.1.2
+ gradio==5.49.1
+ gsplat==1.5.2
+ accelerate==1.10.1
+ git+https://github.com/huggingface/diffusers.git@447e8322f76efea55d4769cd67c372edbf0715b8
+ git+https://github.com/nerfstudio-project/gsplat.git@32f2a54d21c7ecb135320bb02b136b7407ae5712
utils.py ADDED
@@ -0,0 +1,531 @@
+ from io import BytesIO
+ import math
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import importlib
+ from plyfile import PlyData, PlyElement
+
+ import copy
+
+ class EmbedContainer(nn.Module):
+     def __init__(self, tensor):
+         super().__init__()
+         self.tensor = nn.Parameter(tensor)
+
+     def forward(self):
+         return self.tensor
+
+ @torch.no_grad
+ def zero_init(module):
+     if type(module) is torch.nn.Conv2d or type(module) is torch.nn.Linear:
+         module.weight.zero_()
+         module.bias.zero_()
+     return module
+
+ def import_str(string):
+     # From https://github.com/CompVis/taming-transformers
+     module, cls = string.rsplit(".", 1)
+     return getattr(importlib.import_module(module, package=None), cls)
+
+ """
+ from https://github.com/Kai-46/minFM/blob/main/utils/ema.py
+ Exponential Moving Average (EMA) utilities for PyTorch models.
+
+ This module provides utilities for maintaining and updating EMA models,
+ which are commonly used to improve model stability and generalization
+ in training deep neural networks. It supports both regular tensors and
+ DTensors (from FSDP-wrapped models).
+ """
+ class EMA_FSDP:
+     def __init__(self, fsdp_module: torch.nn.Module, decay: float = 0.999):
+         self.decay = decay
+         self.shadow = {}
+         self._init_shadow(fsdp_module)
+
+     @torch.no_grad()
+     def _init_shadow(self, fsdp_module):
+         # Check whether the module is FSDP-wrapped.
+         from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+         if isinstance(fsdp_module, FSDP):
+             with FSDP.summon_full_params(fsdp_module, writeback=False):
+                 for n, p in fsdp_module.module.named_parameters():
+                     self.shadow[n] = p.detach().clone().float().cpu()
+         else:
+             for n, p in fsdp_module.named_parameters():
+                 self.shadow[n] = p.detach().clone().float().cpu()
+
+     @torch.no_grad()
+     def update(self, fsdp_module):
+         d = self.decay
+         from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+         if isinstance(fsdp_module, FSDP):
+             with FSDP.summon_full_params(fsdp_module, writeback=False):
+                 for n, p in fsdp_module.module.named_parameters():
+                     self.shadow[n].mul_(d).add_(p.detach().float().cpu(), alpha=1. - d)
+         else:
+             for n, p in fsdp_module.named_parameters():
+                 self.shadow[n].mul_(d).add_(p.detach().float().cpu(), alpha=1. - d)
+
+     # Optional helpers ---------------------------------------------------
+     def state_dict(self):
+         return self.shadow  # picklable
+
+     def load_state_dict(self, sd):
+         self.shadow = {k: v.clone() for k, v in sd.items()}
+
+     def copy_to(self, fsdp_module):
+         # load EMA weights into an (unwrapped) copy of the generator
+         from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+         with FSDP.summon_full_params(fsdp_module, writeback=True):
+             for n, p in fsdp_module.module.named_parameters():
+                 if n in self.shadow:
+                     p.data.copy_(self.shadow[n].to(device=p.device, dtype=p.dtype))
+
+ def create_raymaps(cameras, h, w):
+     rays_o, rays_d = create_rays(cameras, h, w)
+     raymaps = torch.cat([rays_d, rays_o - (rays_o * rays_d).sum(dim=-1, keepdim=True) * rays_d], dim=-1)
+     return raymaps
+
+ # def create_raymaps(cameras, h, w):
+ #     rays_o, rays_d = create_rays(cameras, h, w)
+ #     raymaps = torch.cat([rays_d, torch.cross(rays_d, rays_o, dim=-1)], dim=-1)
+ #     return raymaps
+
+ class EMANorm(nn.Module):
+     def __init__(self, beta):
+         super().__init__()
+         self.register_buffer('magnitude_ema', torch.ones([]))
+         self.beta = beta
+
+     def forward(self, x):
+         if self.training:
+             # Track the running mean square of the activations.
+             magnitude_cur = x.detach().to(torch.float32).square().mean()
+             self.magnitude_ema.copy_(magnitude_cur.lerp(self.magnitude_ema.to(torch.float32), self.beta))
+         input_gain = self.magnitude_ema.rsqrt()
+         x = x.mul(input_gain)
+         return x
+
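A tiny check of `EMANorm` above: during training it tracks the running mean square of its inputs and rescales them toward unit magnitude (values here are illustrative):

```python
import torch

norm = EMANorm(beta=0.99)
norm.train()
for _ in range(300):
    _ = norm(torch.randn(64) * 5.0)   # activations with std ~5
norm.eval()
out = norm(torch.randn(64) * 5.0)
print(out.std())                      # roughly 1 once the EMA has warmed up
```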
+ class TimestepEmbedding(nn.Module):
+     def __init__(self, dim, max_period=10000, time_factor: float = 1000.0, zero_weight: bool = True):
+         super().__init__()
+         self.max_period = max_period
+         self.time_factor = time_factor
+         self.dim = dim
+         if zero_weight:
+             self.weight = nn.Parameter(torch.zeros(dim))
+         else:
+             self.weight = None
+
+     def forward(self, t):
+         if self.weight is None:
+             return timestep_embedding(t, self.dim, self.max_period, self.time_factor)
+         else:
+             return timestep_embedding(t, self.dim, self.max_period, self.time_factor) * self.weight.unsqueeze(0)
+
+ @torch.compile(mode="max-autotune-no-cudagraphs", dynamic=True)
+ def timestep_embedding(t, dim, max_period=10000, time_factor: float = 1000.0):
+     """
+     Create sinusoidal timestep embeddings.
+     :param t: a 1-D Tensor of N indices, one per batch element. These may be fractional.
+     :param dim: the dimension of the output.
+     :param max_period: controls the minimum frequency of the embeddings.
+     :return: an (N, D) Tensor of positional embeddings.
+     """
+     t = time_factor * t
+     half = dim // 2
+     freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(t.device)
+
+     args = t[:, None].float() * freqs[None]
+     embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+     if dim % 2:
+         embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+     if torch.is_floating_point(t):
+         embedding = embedding.to(t)
+     return embedding
+
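A minimal sketch of the sinusoidal embedding above for two diffusion times (the 1000x `time_factor` is the function default; the first call may be slow since the function is wrapped in `torch.compile`):

```python
import torch

t = torch.tensor([0.0, 0.5])        # normalized timesteps in [0, 1]
emb = timestep_embedding(t, dim=8)  # (2, 8): 4 cosine then 4 sine channels
print(emb.shape, emb[0])            # t=0 gives an all-ones cos half, zeros sin half
```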
150
+ def quaternion_to_matrix(quaternions):
151
+ """
152
+ Convert rotations given as quaternions to rotation matrices.
153
+ Args:
154
+ quaternions: quaternions with real part first,
155
+ as tensor of shape (..., 4).
156
+ Returns:
157
+ Rotation matrices as tensor of shape (..., 3, 3).
158
+ """
159
+ r, i, j, k = torch.unbind(quaternions, -1)
160
+ two_s = 2.0 / (quaternions * quaternions).sum(-1)
161
+
162
+ o = torch.stack(
163
+ (
164
+ 1 - two_s * (j * j + k * k),
165
+ two_s * (i * j - k * r),
166
+ two_s * (i * k + j * r),
167
+ two_s * (i * j + k * r),
168
+ 1 - two_s * (i * i + k * k),
169
+ two_s * (j * k - i * r),
170
+ two_s * (i * k - j * r),
171
+ two_s * (j * k + i * r),
172
+ 1 - two_s * (i * i + j * j),
173
+ ),
174
+ -1,
175
+ )
176
+ return o.reshape(quaternions.shape[:-1] + (3, 3))
177
+
178
+ # from https://pytorch3d.readthedocs.io/en/latest/_modules/pytorch3d/transforms/rotation_conversions.html#matrix_to_quaternion
179
+ def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
180
+ """
181
+ Convert a unit quaternion to a standard form: one in which the real
182
+ part is non negative.
183
+
184
+ Args:
185
+ quaternions: Quaternions with real part first,
186
+ as tensor of shape (..., 4).
187
+
188
+ Returns:
189
+ Standardized quaternions as tensor of shape (..., 4).
190
+ """
191
+ return torch.where(quaternions[..., 0:1] < 0, -quaternions, quaternions)
192
+
193
+ def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
194
+ """
195
+ Returns torch.sqrt(torch.max(0, x))
196
+ but with a zero subgradient where x is 0.
197
+ """
198
+ ret = torch.zeros_like(x)
199
+ positive_mask = x > 0
200
+ if torch.is_grad_enabled():
201
+ ret[positive_mask] = torch.sqrt(x[positive_mask])
202
+ else:
203
+ ret = torch.where(positive_mask, torch.sqrt(x), ret)
204
+ return ret
205
+
+ def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor:
+     """
+     Convert rotations given as rotation matrices to quaternions.
+
+     Args:
+         matrix: Rotation matrices as tensor of shape (..., 3, 3).
+
+     Returns:
+         quaternions with real part first, as tensor of shape (..., 4).
+     """
+     if matrix.size(-1) != 3 or matrix.size(-2) != 3:
+         raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
+
+     batch_dim = matrix.shape[:-2]
+     m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(
+         matrix.reshape(batch_dim + (9,)), dim=-1
+     )
+
+     q_abs = _sqrt_positive_part(
+         torch.stack(
+             [
+                 1.0 + m00 + m11 + m22,
+                 1.0 + m00 - m11 - m22,
+                 1.0 - m00 + m11 - m22,
+                 1.0 - m00 - m11 + m22,
+             ],
+             dim=-1,
+         )
+     )
+
+     # we produce the desired quaternion multiplied by each of r, i, j, k
+     quat_by_rijk = torch.stack(
+         [
+             torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
+             torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
+             torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
+             torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
+         ],
+         dim=-2,
+     )
+
+     # We floor here at 0.1 but the exact level is not important; if q_abs is small,
+     # the candidate won't be picked.
+     flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
+     quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))
+
+     # if not for numerical problems, quat_candidates[i] should be the same (up to a
+     # sign) for all i; we pick the best-conditioned one (with the largest denominator)
+     indices = q_abs.argmax(dim=-1, keepdim=True)
+     expand_dims = list(batch_dim) + [1, 4]
+     gather_indices = indices.unsqueeze(-1).expand(expand_dims)
+     out = torch.gather(quat_candidates, -2, gather_indices).squeeze(-2)
+     return standardize_quaternion(out)
+
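+ # Editor's sketch (illustrative, not from the original commit): a round-trip
+ # consistency check between the two conversions above. `_demo_quaternion_roundtrip`
+ # is a hypothetical helper; it assumes torch and F (torch.nn.functional) are
+ # imported earlier in app.py, as the surrounding code implies.
+ def _demo_quaternion_roundtrip():
+     q = F.normalize(torch.randn(8, 4), dim=-1)  # random unit quaternions
+     q[:, 0] = q[:, 0].abs() + 0.1               # keep the real part safely positive
+     q = F.normalize(q, dim=-1)
+     q_back = matrix_to_quaternion(quaternion_to_matrix(q))
+     assert torch.allclose(q, q_back, atol=1e-4)
+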
+ @torch.amp.autocast(device_type="cuda", enabled=False)
+ def normalize_cameras(cameras, return_meta=False, ref_w2c=None, T_norm=None, n_frame=None):
+     # cameras: (B, N, 11) rows of [quaternion (4), translation (3), fx, fy, cx, cy];
+     # poses are re-expressed relative to the first frame and translations are
+     # rescaled to roughly unit norm
+     B, N = cameras.shape[:2]
+
+     c2ws = torch.zeros(B, N, 3, 4, device=cameras.device)
+
+     c2ws[..., :3, :3] = quaternion_to_matrix(cameras[..., 0:4])
+     c2ws[..., :, 3] = cameras[..., 4:7]
+
+     _c2ws = c2ws
+
+     ref_w2c = torch.inverse(matrix_to_square(_c2ws[:, :1])) if ref_w2c is None else ref_w2c
+     _c2ws = (ref_w2c.repeat(1, N, 1, 1) @ matrix_to_square(_c2ws))[..., :3, :]
+
+     if n_frame is not None:
+         T_norm = _c2ws[..., :n_frame, :3, 3].norm(dim=-1).max(dim=1)[0][..., None, None] if T_norm is None else T_norm
+     else:
+         T_norm = _c2ws[..., :3, 3].norm(dim=-1).max(dim=1)[0][..., None, None] if T_norm is None else T_norm
+
+     _c2ws[..., :3, 3] = _c2ws[..., :3, 3] / (T_norm + 1e-2)
+
+     R = matrix_to_quaternion(_c2ws[..., :3, :3])
+     T = _c2ws[..., :3, 3]
+     cameras = torch.cat([R.float(), T.float(), cameras[..., 7:]], dim=-1)
+
+     if return_meta:
+         return cameras, ref_w2c, T_norm
+     else:
+         return cameras
+
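+ # Editor's sketch (illustrative, not from the original commit): normalizing a
+ # dummy batch of cameras. `_demo_normalize_cameras` is a hypothetical helper; the
+ # 11-dim layout [quaternion, translation, fx, fy, cx, cy] follows the slicing
+ # used by the function above.
+ def _demo_normalize_cameras():
+     cams = torch.zeros(1, 4, 11)
+     cams[..., 0] = 1.0                      # identity rotations (w, x, y, z)
+     cams[..., 4] = torch.linspace(0, 2, 4)  # translate along x
+     cams[..., 7:] = 0.5                     # normalized intrinsics
+     normed, ref_w2c, T_norm = normalize_cameras(cams, return_meta=True)
+     assert normed.shape == cams.shape       # poses are now relative to frame 0
+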
+ def create_rays(cameras, h, w, uv_offset=None):
+     prefix_shape = cameras.shape[:-1]
+     cameras = cameras.flatten(0, -2)
+     device = cameras.device
+     N = cameras.shape[0]
+
+     c2w = torch.eye(4, device=device)[None].repeat(N, 1, 1)
+     c2w[:, :3, :3] = quaternion_to_matrix(cameras[:, :4])
+     c2w[:, :3, 3] = cameras[:, 4:7]
+
+     # fx, cx are stored normalized by the original image width; fy, cy by the height
+     fx, fy, cx, cy = cameras[:, 7:].chunk(4, -1)
+
+     fx, cx = fx * w, cx * w
+     fy, cy = fy * h, cy * h
+
+     inds = torch.arange(0, h*w, device=device).expand(N, h*w)
+
+     i = inds % w + 0.5
+     j = torch.div(inds, w, rounding_mode='floor') + 0.5
+
+     u = i / cx + (uv_offset[..., 0].reshape(N, h*w) if uv_offset is not None else 0)
+     v = j / cy + (uv_offset[..., 1].reshape(N, h*w) if uv_offset is not None else 0)
+
+     zs = -torch.ones_like(i)
+     xs = -(u - 1) * cx / fx * zs
+     ys = (v - 1) * cy / fy * zs
+     directions = torch.stack((xs, ys, zs), dim=-1)
+
+     rays_d = F.normalize(directions @ c2w[:, :3, :3].transpose(-1, -2), dim=-1)
+
+     rays_o = c2w[..., :3, 3]  # [N, 3]
+     rays_o = rays_o[..., None, :].expand_as(rays_d)
+
+     rays_o = rays_o.reshape(*prefix_shape, h, w, 3)
+     rays_d = rays_d.reshape(*prefix_shape, h, w, 3)
+
+     return rays_o, rays_d
+
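+ # Editor's sketch (illustrative, not from the original commit): generating rays
+ # for a single pinhole camera. `_demo_create_rays` is a hypothetical helper; with
+ # fx = fy = cx = cy = 0.5 (normalized intrinsics, principal point at the image
+ # center), rays near the image center point approximately along the camera's
+ # -z viewing axis.
+ def _demo_create_rays(h=8, w=8):
+     cam = torch.tensor([[1.0, 0, 0, 0, 0, 0, 0, 0.5, 0.5, 0.5, 0.5]])
+     rays_o, rays_d = create_rays(cam, h, w)
+     assert rays_o.shape == (1, h, w, 3) and rays_d.shape == (1, h, w, 3)
+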
+ def matrix_to_square(mat):
+     # append the homogeneous row [0, 0, 0, 1] to (..., 3, 4) pose matrices,
+     # yielding square (..., 4, 4) matrices
+     l = len(mat.shape)
+     if l == 3:
+         return torch.cat([mat, torch.tensor([0, 0, 0, 1]).repeat(mat.shape[0], 1, 1).to(mat.device)], dim=1)
+     elif l == 4:
+         return torch.cat([mat, torch.tensor([0, 0, 0, 1]).repeat(mat.shape[0], mat.shape[1], 1, 1).to(mat.device)], dim=2)
+
+ def export_ply_for_gaussians(path, gaussians, opacity_threshold=0.00, T_norm=None):
+
+     # each Gaussian row packs [xyz (3), opacity (1), scale (3), rotation (4), SH features]
+     sh_degree = int(math.sqrt((gaussians.shape[-1] - sum([3, 1, 3, 4])) / 3 - 1))
+
+     xyz, opacity, scale, rotation, feature = gaussians.float().split([3, 1, 3, 4, (sh_degree + 1)**2 * 3], dim=-1)
+
+     means3D = xyz.contiguous().float()
+     opacity = opacity.contiguous().float()
+     scales = scale.contiguous().float()
+     rotations = rotation.contiguous().float()
+     shs = feature.contiguous().float()  # [N, (sh_degree + 1)**2 * 3]
+
+     # prune by opacity
+     if opacity_threshold > 0:
+         mask = opacity[..., 0] >= opacity_threshold
+         means3D = means3D[mask]
+         opacity = opacity[mask]
+         scales = scales[mask]
+         rotations = rotations[mask]
+         shs = shs[mask]
+
+         print("Gaussian percentage: ", mask.float().mean())
+
+     if T_norm is not None:
+         means3D = means3D * T_norm.item()
+         scales = scales * T_norm.item()
+
+     # invert the activations (inverse sigmoid for opacity, log for scale)
+     # to make the values compatible with the original ply format
+     opacity = torch.log(opacity / (1 - opacity))
+     scales = torch.log(scales + 1e-8)
+
+     xyzs = means3D.detach()
+     f_dc = shs.detach().flatten(start_dim=1).contiguous()
+     opacities = opacity.detach()
+     scales = scales.detach()
+     rotations = rotations.detach()
+
+     l = ['x', 'y', 'z']
+     # All channels except the 3 DC
+     for i in range(f_dc.shape[1]):
+         l.append('f_dc_{}'.format(i))
+     l.append('opacity')
+     for i in range(scales.shape[1]):
+         l.append('scale_{}'.format(i))
+     for i in range(rotations.shape[1]):
+         l.append('rot_{}'.format(i))
+
+     # fastest approach: build the vertex data directly as a numpy recarray
+     attributes = torch.cat((xyzs, f_dc, opacities, scales, rotations), dim=1).cpu().numpy()
+
+     # create the recarray in one shot, avoiding per-row loops and type conversions
+     elements = np.rec.fromarrays([attributes[:, i] for i in range(attributes.shape[1])], names=l, formats=['f4'] * len(l))
+     el = PlyElement.describe(elements, 'vertex')
+
+     print(path)
+
+     PlyData([el]).write(path)
+
+     # A disabled alternative exporter kept by the author: it would write the
+     # compact .splat format instead of .ply (re-enabling it would also require
+     # io.BytesIO to be imported).
+     # plydata = PlyData([el])
+
+     # vert = plydata["vertex"]
+     # sorted_indices = np.argsort(
+     #     -np.exp(vert["scale_0"] + vert["scale_1"] + vert["scale_2"])
+     #     / (1 + np.exp(-vert["opacity"]))
+     # )
+     # buffer = BytesIO()
+     # for idx in sorted_indices:
+     #     v = plydata["vertex"][idx]
+     #     position = np.array([v["x"], v["y"], v["z"]], dtype=np.float32)
+     #     scales = np.exp(
+     #         np.array(
+     #             [v["scale_0"], v["scale_1"], v["scale_2"]],
+     #             dtype=np.float32,
+     #         )
+     #     )
+     #     rot = np.array(
+     #         [v["rot_0"], v["rot_1"], v["rot_2"], v["rot_3"]],
+     #         dtype=np.float32,
+     #     )
+     #     SH_C0 = 0.28209479177387814
+     #     color = np.array(
+     #         [
+     #             0.5 + SH_C0 * v["f_dc_0"],
+     #             0.5 + SH_C0 * v["f_dc_1"],
+     #             0.5 + SH_C0 * v["f_dc_2"],
+     #             1 / (1 + np.exp(-v["opacity"])),
+     #         ]
+     #     )
+     #     buffer.write(position.tobytes())
+     #     buffer.write(scales.tobytes())
+     #     buffer.write((color * 255).clip(0, 255).astype(np.uint8).tobytes())
+     #     buffer.write(
+     #         ((rot / np.linalg.norm(rot)) * 128 + 128)
+     #         .clip(0, 255)
+     #         .astype(np.uint8)
+     #         .tobytes()
+     #     )
+
+     # with open(path + '.splat', "wb") as f:
+     #     f.write(buffer.getvalue())
+
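+ # Editor's sketch (illustrative, not from the original commit): a minimal export
+ # call. `_demo_export_ply` and the output path are hypothetical; with sh_degree = 0
+ # each Gaussian is a 14-dim row [xyz (3), opacity (1), scale (3), quaternion (4),
+ # SH DC color (3)], where opacity must lie in (0, 1) and scale must be positive,
+ # since the exporter applies the inverse sigmoid / log before writing.
+ def _demo_export_ply(path="demo.ply"):
+     n = 100
+     gaussians = torch.cat([
+         torch.randn(n, 3),                       # means
+         torch.rand(n, 1) * 0.98 + 0.01,          # opacities in (0.01, 0.99)
+         torch.rand(n, 3) * 0.1 + 1e-3,           # positive scales
+         F.normalize(torch.randn(n, 4), dim=-1),  # unit rotation quaternions
+         torch.rand(n, 3),                        # degree-0 SH coefficients
+     ], dim=-1)
+     export_ply_for_gaussians(path, gaussians)
+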
+ @torch.amp.autocast(device_type="cuda", enabled=False)
+ def quaternion_slerp(
+     q0, q1, fraction, spin: int = 0, shortestpath: bool = True
+ ):
+     """Return spherical linear interpolation between two quaternions.
+     Args:
+         q0: first quaternion
+         q1: second quaternion
+         fraction: how much to interpolate between q0 vs q1 (if 0, closer to q0; if 1, closer to q1)
+         spin: how much of an additional spin to place on the interpolation
+         shortestpath: whether to return the short or long path to rotation
+     """
+     d = (q0 * q1).sum(-1)
+     if shortestpath:
+         # flip q1 onto the hemisphere of q0 so the shorter arc is interpolated
+         neg = d < 0.0
+         q1 = q1.clone()
+         q1[neg] = -q1[neg]
+         d[neg] = -d[neg]
+
+     _d = d.clamp(0, 1.0)
+
+     # theta = torch.arccos(d) * fraction
+     # q2 = q1 - q0 * d
+     # q2 = q2 / (q2.norm(dim=-1) + 1e-10)
+
+     # return torch.cos(theta) * q0 + torch.sin(theta) * q2
+
+     angle = torch.acos(_d) + spin * math.pi
+     isin = 1.0 / (torch.sin(angle) + 1e-10)
+     q0_ = q0 * (torch.sin((1.0 - fraction) * angle) * isin)[..., None]
+     q1_ = q1 * (torch.sin(fraction * angle) * isin)[..., None]
+
+     q = q0_ + q1_
+
+     # fall back to q0 when the two quaternions are (nearly) identical
+     q[angle < 1e-5] = q0[angle < 1e-5]
+     # q[fraction < 1e-5] = q0[fraction < 1e-5]
+     # q[fraction > 1 - 1e-5] = q1[fraction > 1 - 1e-5]
+     # q[(d.abs() - 1).abs() < 1e-5] = q0[(d.abs() - 1).abs() < 1e-5]
+
+     return q
+
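+ # Editor's sketch (illustrative, not from the original commit): slerp halfway
+ # between the identity and a 90-degree rotation about z should give a 45-degree
+ # rotation about z. `_demo_quaternion_slerp` is a hypothetical helper and assumes
+ # torch and math are imported earlier in app.py.
+ def _demo_quaternion_slerp():
+     q0 = torch.tensor([[1.0, 0.0, 0.0, 0.0]])                                      # identity
+     q1 = torch.tensor([[math.cos(math.pi / 4), 0.0, 0.0, math.sin(math.pi / 4)]])  # 90 deg about z
+     q_mid = quaternion_slerp(q0, q1, torch.tensor([0.5]))
+     expected = torch.tensor([[math.cos(math.pi / 8), 0.0, 0.0, math.sin(math.pi / 8)]])
+     assert torch.allclose(q_mid, expected, atol=1e-5)
+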
+ def sample_from_two_pose(pose_a, pose_b, fraction, noise_strengths=[0, 0]):
+     """
+     Args:
+         pose_a: first pose
+         pose_b: second pose
+         fraction: interpolation weight in [0, 1] (0 gives pose_a, 1 gives pose_b)
+     """
+
+     quat_a = pose_a[..., :4]
+     quat_b = pose_b[..., :4]
+
+     dot = torch.sum(quat_a * quat_b, dim=-1, keepdim=True)
+     quat_b = torch.where(dot < 0, -quat_b, quat_b)
+
+     quaternion = quaternion_slerp(quat_a, quat_b, fraction)
+     quaternion = torch.nn.functional.normalize(quaternion + torch.randn_like(quaternion) * noise_strengths[0], dim=-1)
+
+     T = (1 - fraction)[:, None] * pose_a[..., 4:] + fraction[:, None] * pose_b[..., 4:]
+     T = T + torch.randn_like(T) * noise_strengths[1]
+
+     new_pose = pose_a.clone()
+     new_pose[..., :4] = quaternion
+     new_pose[..., 4:] = T
+     return new_pose
+
+ def sample_from_dense_cameras(dense_cameras, t, noise_strengths=[0, 0, 0, 0]):
+     N, C = dense_cameras.shape
+     M = t.shape[0]
+
+     # locate the two neighboring keyframes for each t in [0, 1]
+     left = torch.floor(t * (N - 1)).long().clamp(0, N - 2)
+     right = left + 1
+     fraction = t * (N - 1) - left
+
+     a = torch.gather(dense_cameras, 0, left[..., None].repeat(1, C))
+     b = torch.gather(dense_cameras, 0, right[..., None].repeat(1, C))
+
+     new_pose = sample_from_two_pose(a[:, :7], b[:, :7], fraction, noise_strengths=noise_strengths[:2])
+
+     # intrinsics are interpolated linearly
+     new_ins = (1 - fraction)[:, None] * a[:, 7:] + fraction[:, None] * b[:, 7:]
+
+     return torch.cat([new_pose, new_ins], dim=1)
+
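+ # Editor's sketch (illustrative, not from the original commit): sampling a smooth
+ # camera path. `_demo_sample_trajectory` is a hypothetical helper; dense_cameras
+ # holds N keyframes of 11-dim cameras and t selects continuous positions in [0, 1],
+ # with rotations slerped and translations/intrinsics interpolated linearly.
+ def _demo_sample_trajectory():
+     N = 5
+     dense_cameras = torch.zeros(N, 11)
+     dense_cameras[:, 0] = 1.0                      # identity rotations
+     dense_cameras[:, 4] = torch.linspace(0, 1, N)  # translate along x
+     dense_cameras[:, 7:] = 0.5                     # constant intrinsics
+     t = torch.linspace(0, 1, 16)
+     sampled = sample_from_dense_cameras(dense_cameras, t)
+     assert sampled.shape == (16, 11)
+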