zimhe committed
Commit a521a3f · 1 Parent(s): 5790b69

add scripts and examples
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
examples/examples.json ADDED
@@ -0,0 +1,65 @@
+ {
+     "Mountain Fall": {
+         "img": "https://huggingface.co/zimhe/SpatialDiffusion/resolve/main/examples/mount-fall.jpg",
+         "global": "large wide-angle landscape view of rocky mountains in fall, a lake and forest at the mountain foot, blue sky at dusk with scattered golden clouds",
+         "front": "view of towering mountains and rivers, with a dim, clear, foggy sky",
+         "back": "low mountains, scattered and layered, bright golden cloudy sky",
+         "left": "layered mountain peaks with thin cyan, green, and creamy colors, bright cloudy sky",
+         "right": "green water blending with the sky, bright cloudy sky, soft light",
+         "top": "bright beige sky color, thick scattered clouds, faded to background",
+         "bottom": "earthy ground with dry grass and scattered stones"
+     },
+     "Qianli": {
+         "img": "https://huggingface.co/zimhe/SpatialDiffusion/resolve/main/examples/qianli.png",
+         "global": "Chinese traditional landscape painting, aged beige color scheme, thin and subtle warm colors, mountains with warm-colored bases and peaks tinted thin cyan and green, bright cloudy sky with soft light, highly detailed, masterpiece.",
+         "front": "view of towering mountains and rivers, with a dim, clear, foggy sky",
+         "back": "low mountains, scattered and layered, bright golden cloudy sky",
+         "left": "layered mountain peaks with thin cyan, green, and creamy colors, bright cloudy sky",
+         "right": "green water blending with the sky, bright cloudy sky, soft light",
+         "top": "bright beige sky color, thick scattered clouds, faded to background",
+         "bottom": "earthy ground with dry grass and scattered stones"
+     },
+     "Office": {
+         "img": "examples/office.png",
+         "global": "A modernist open office interior by Louis Kahn, low view angle, simple clean structures, spacious and expansive, natural light, concrete and warm wood materials, view of the city, high quality, 8k resolution, photorealistic, ultra-detailed, masterpiece.",
+         "front": "office interior with delicate workplace desks",
+         "back": "office area with many desks and chairs",
+         "left": "large glass partition with dark metal frames",
+         "right": "public office area with beautiful curvy structures",
+         "top": "clean white ceiling, linear continuous lights",
+         "bottom": "clean empty floor viewed from above, clean gray gloss material"
+     },
+     "Forest": {
+         "img": "https://huggingface.co/zimhe/SpatialDiffusion/resolve/main/examples/forest.jpg",
+         "global": "within a forest with beautiful weaving trees, a tiger inside the bush, small flowing water, very thick tree trunks.",
+         "front": "view of a thriving forest, with branches covering the sky",
+         "back": "a small pathway leading to a small stream, with trees on both sides",
+         "left": "short bushes and trees",
+         "right": "stone-made pathways, with green leaves and branches, bushes surrounding the bottom",
+         "top": "crossing branches and leaves, with a few rays of light peeking through",
+         "bottom": "earthy ground with grass, scattered stones and twigs, and a few leaves on the ground"
+     },
+     "Las-Meninas": {
+         "img": "https://huggingface.co/zimhe/SpatialDiffusion/resolve/main/examples/Las-meninas.png",
+         "global": "A 17th-century Spanish royal studio interior with baroque decor, soft natural light, a painter at work, a young princess in a white dress, maids of honor, and a calm dog. Classic, elegant, and atmospheric.",
+         "front": "The princess in the center with her maids, the painter beside a large canvas, and a dog lying on the floor. Bright, central focus.",
+         "back": "Back of the studio with walls full of paintings and dim shadows. Quiet and deep space.",
+         "left": "Side view of the large canvas and the painter, with soft light from the opposite side.",
+         "right": "Ladies-in-waiting from the side, detailed dresses, dog in profile, and portraits on the dark wall.",
+         "top": "Ceiling with wooden beams and chandeliers, lit softly.",
+         "bottom": "Floor with shadows, the princess's gown, and the resting dog."
+     },
+     "Van-Gogh": {
+         "img": "https://huggingface.co/zimhe/SpatialDiffusion/resolve/main/examples/vangoh.png",
+         "global": "A cozy bedroom in the style of Vincent van Gogh. The scene features rustic wooden furniture, a bed with a red pillow and yellow blanket",
+         "front": "The Bedroom in Arles by Vincent van Gogh",
+         "back": "wall with a door and window, with a view of the outside",
+         "left": "wooden chairs with yellow cushions, a small wooden table with a water pitcher and a mirror above it",
+         "right": "walls painted a vibrant blue, with paintings and decorations hanging slightly tilted",
+         "top": "a white ceiling with an old light fixture, a window with green shutters letting in warm daylight",
+         "bottom": "wooden floor with a textured, painterly look"
+     }
+ }
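Each entry above pairs an `img` with one `global` prompt plus six per-face prompts keyed `front`/`back`/`left`/`right`/`top`/`bottom` — the same split the pipeline below consumes as `global_prompt` and `per_face_prompts`. A minimal sketch of loading one entry; the splitting logic here is illustrative, not a committed helper:

```python
import json

# Load the example prompt sets committed in this repo.
with open("examples/examples.json", encoding="utf-8") as f:
    examples = json.load(f)

entry = examples["Mountain Fall"]
global_prompt = entry["global"]
# Everything except the image reference and the global prompt is a per-face prompt.
per_face_prompts = {k: v for k, v in entry.items() if k not in ("img", "global")}

print(sorted(per_face_prompts))  # ['back', 'bottom', 'front', 'left', 'right', 'top']
```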
scripts/__pycache__/analysis.cpython-312.pyc ADDED
Binary file (3.94 kB)
scripts/__pycache__/cubemap_dataset.cpython-310.pyc ADDED
Binary file (2.53 kB)
scripts/__pycache__/cubemap_dataset.cpython-312.pyc ADDED
Binary file (7.77 kB)
scripts/__pycache__/cubemap_diffusion_pipeline.cpython-312.pyc ADDED
Binary file (26.1 kB)
scripts/__pycache__/cubemap_unet.cpython-310.pyc ADDED
Binary file (3.33 kB)
scripts/__pycache__/cubemap_unet.cpython-312.pyc ADDED
Binary file (3.67 kB)
scripts/__pycache__/cubemap_unet_attention.cpython-310.pyc ADDED
Binary file (2.17 kB)
scripts/__pycache__/cubemap_unet_attention.cpython-312.pyc ADDED
Binary file (4.77 kB)
scripts/__pycache__/cubemap_unet_attention_processor.cpython-312.pyc ADDED
Binary file (3.99 kB)
scripts/__pycache__/cubemap_unet_attention_simple.cpython-312.pyc ADDED
Binary file (3.87 kB)
scripts/__pycache__/cubemap_vae.cpython-310.pyc ADDED
Binary file (6.38 kB)
scripts/__pycache__/cubemap_vae.cpython-312.pyc ADDED
Binary file (8.71 kB)
scripts/__pycache__/preprocess.cpython-310.pyc ADDED
Binary file (2.7 kB)
scripts/__pycache__/train_cubemap_args.cpython-310.pyc ADDED
Binary file (8.46 kB)
scripts/__pycache__/train_cubemap_args.cpython-312.pyc ADDED
Binary file (12 kB)
scripts/__pycache__/train_cubemap_unet.cpython-312.pyc ADDED
Binary file (31.7 kB)
scripts/__pycache__/utils.cpython-310.pyc ADDED
Binary file (553 Bytes)
scripts/__pycache__/utils.cpython-312.pyc ADDED
Binary file (26.9 kB)
scripts/cubemap_diffusion_pipeline.py ADDED
@@ -0,0 +1,550 @@
+ import torch
+ from typing import Any, Callable, Dict, List, Optional, Union
+ from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline, StableDiffusionPipelineOutput
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+ from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import retrieve_timesteps
+ from diffusers.models import AsymmetricAutoencoderKL, AutoencoderKL, ImageProjection, UNet2DConditionModel
+ from .utils import FACES, generate_cubemap_uv
+ from diffusers import AutoPipelineForInpainting
+
+ DEFAULT_VIEW_PROMPT = {
+     "front": "view to the front, looking forward",
+     "back": "view to the back, looking backward",
+     "left": "view to the left side, looking left",
+     "right": "view to the right side, looking right",
+     "top": "view to above, looking upward, ceiling or sky",
+     "bottom": "view to below, looking downward, floor or ground",
+ }
+
+
+ class CubemapDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
+     def __init__(self, vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, image_encoder=None, requires_safety_checker=True):
+         super().__init__(vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, image_encoder, requires_safety_checker)
+
+     def prepare_cubemap_mask_condition(self, width: int, height: int, batch_size=6) -> torch.Tensor:
+         # Initialize a mask of ones ("repaint this face") with shape (batch_size, 1, height, width)
+         mask = torch.ones(size=(batch_size, 1, height, width), dtype=torch.float32)
+         # Zero out the first face's mask so the conditioning image is preserved
+         mask[0] = 0
+         return mask
+
+     @torch.no_grad()
+     def __call__(
+         self,
+         global_prompt: str = None,
+         per_face_prompts: Optional[Dict[str, str]] = None,
+         image: PipelineImageInput = None,
+         mask_image: PipelineImageInput = None,
+         masked_image_latents: torch.Tensor = None,
+         height: Optional[int] = None,
+         width: Optional[int] = None,
+         padding_mask_crop: Optional[int] = None,
+         strength: float = 1.0,
+         num_inference_steps: int = 50,
+         timesteps: List[int] = None,
+         sigmas: List[float] = None,
+         guidance_scale: float = 7.5,
+         negative_prompt: Optional[Union[str, List[str]]] = None,
+         num_images_per_prompt: Optional[int] = 1,
+         eta: float = 0.0,
+         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+         latents: Optional[torch.Tensor] = None,
+         prompt_embeds: Optional[torch.Tensor] = None,
+         negative_prompt_embeds: Optional[torch.Tensor] = None,
+         ip_adapter_image: Optional[PipelineImageInput] = None,
+         ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+         output_type: Optional[str] = "pil",
+         return_dict: bool = True,
+         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+         clip_skip: int = None,
+         callback_on_step_end: Optional[
+             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+         ] = None,
+         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+         **kwargs,
+     ):
+         r"""
+         The call function to the pipeline for generation.
+
+         Args:
+             global_prompt (`str`, *optional*):
+                 A prompt shared by all six cubemap faces. For any face without an entry in
+                 `per_face_prompts`, it is combined with the view hint from `DEFAULT_VIEW_PROMPT`.
+             per_face_prompts (`Dict[str, str]`, *optional*):
+                 Per-face prompts keyed by face name (`front`, `back`, `left`, `right`, `top`, `bottom`).
+                 At least one of `global_prompt` and `per_face_prompts` must be provided.
+             image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                 `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to
+                 be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch
+                 tensor, the expected value range is between `[0, 1]`. If it's a tensor or a list of tensors, the
+                 expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the
+                 expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image latents as `image`, but
+                 if passing latents directly they are not encoded again.
+             mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                 `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask
+                 are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
+                 single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one
+                 color channel (L) instead of 3, so the expected shape for a pytorch tensor would be `(B, 1, H, W)`, `(B,
+                 H, W)`, `(1, H, W)`, or `(H, W)`, and for a numpy array `(B, H, W, 1)`, `(B, H, W)`, `(H, W,
+                 1)`, or `(H, W)`.
+             height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                 The height in pixels of the generated image.
+             width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                 The width in pixels of the generated image.
+             padding_mask_crop (`int`, *optional*, defaults to `None`):
+                 The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
+                 image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+                 with the same aspect ratio as the image that contains all masked areas, and then expand that area based
+                 on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+                 resizing to the original image size for inpainting. This is useful when the masked area is small while
+                 the image is large and contains information irrelevant for inpainting, such as background.
+             strength (`float`, *optional*, defaults to 1.0):
+                 Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                 starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                 on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                 process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                 essentially ignores `image`.
+             num_inference_steps (`int`, *optional*, defaults to 50):
+                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                 expense of slower inference. This parameter is modulated by `strength`.
+             timesteps (`List[int]`, *optional*):
+                 Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                 passed will be used. Must be in descending order.
+             sigmas (`List[float]`, *optional*):
+                 Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                 their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                 will be used.
+             guidance_scale (`float`, *optional*, defaults to 7.5):
+                 A higher guidance scale value encourages the model to generate images closely linked to the text
+                 `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+             negative_prompt (`str` or `List[str]`, *optional*):
+                 The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                 pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+             num_images_per_prompt (`int`, *optional*, defaults to 1):
+                 The number of images to generate per prompt.
+             eta (`float`, *optional*, defaults to 0.0):
+                 Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                 to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                 generation deterministic.
+             latents (`torch.Tensor`, *optional*):
+                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                 tensor is generated by sampling using the supplied random `generator`.
+             prompt_embeds (`torch.Tensor`, *optional*):
+                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                 provided, text embeddings are generated from the `prompt` input argument.
+             negative_prompt_embeds (`torch.Tensor`, *optional*):
+                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+             ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+             ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                 Pre-generated image embeddings for IP-Adapter. It should be a list with length equal to the number of
+                 IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                 contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                 provided, embeddings are computed from the `ip_adapter_image` input argument.
+             output_type (`str`, *optional*, defaults to `"pil"`):
+                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+             return_dict (`bool`, *optional*, defaults to `True`):
+                 Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                 plain tuple.
+             cross_attention_kwargs (`dict`, *optional*):
+                 A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                 [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+             clip_skip (`int`, *optional*):
+                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                 the output of the pre-final layer will be used for computing the prompt embeddings.
+             callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                 A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                 each denoising step during inference with the following arguments: `callback_on_step_end(self:
+                 DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                 list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+             callback_on_step_end_tensor_inputs (`List`, *optional*):
+                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                 `._callback_tensor_inputs` attribute of your pipeline class.
+         Examples:
+
+         ```py
+         >>> import PIL
+         >>> import requests
+         >>> import torch
+         >>> from io import BytesIO
+
+         >>> from diffusers import StableDiffusionInpaintPipeline
+
+
+         >>> def download_image(url):
+         ...     response = requests.get(url)
+         ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+         >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+         >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+         >>> init_image = download_image(img_url).resize((512, 512))
+         >>> mask_image = download_image(mask_url).resize((512, 512))
+
+         >>> pipe = StableDiffusionInpaintPipeline.from_pretrained(
+         ...     "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
+         ... )
+         >>> pipe = pipe.to("cuda")
+
+         >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+         >>> image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+         ```
+
+         Returns:
+             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                 If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                 otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                 second element is a list of `bool`s indicating whether the corresponding generated image contains
+                 "not-safe-for-work" (nsfw) content.
+         """
+
+         callback = kwargs.pop("callback", None)
+         callback_steps = kwargs.pop("callback_steps", None)
+
+         if callback is not None:
+             deprecate(
+                 "callback",
+                 "1.0.0",
+                 "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+             )
+         if callback_steps is not None:
+             deprecate(
+                 "callback_steps",
+                 "1.0.0",
+                 "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+             )
+
+         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+         # 0. Default height and width to unet
+         height = height or self.unet.config.sample_size * self.vae_scale_factor
+         width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+         # # 1. Check inputs
+         # self.check_inputs(
+         #     prompt,
+         #     image,
+         #     mask_image,
+         #     height,
+         #     width,
+         #     strength,
+         #     callback_steps,
+         #     output_type,
+         #     negative_prompt,
+         #     prompt_embeds,
+         #     negative_prompt_embeds,
+         #     ip_adapter_image,
+         #     ip_adapter_image_embeds,
+         #     callback_on_step_end_tensor_inputs,
+         #     padding_mask_crop,
+         # )
+
+         self._guidance_scale = guidance_scale
+         self._clip_skip = clip_skip
+         self._cross_attention_kwargs = cross_attention_kwargs
+         self._interrupt = False
+
+         batch_size = 6  # one prompt/latent per cubemap face
+
+         if global_prompt is None and per_face_prompts is None:
+             raise ValueError(
+                 "Provide either `global_prompt` or `per_face_prompts`, or both. They can't both be empty."
+             )
+
+         prompt = []
+
+         # 2. Define call parameters
+         if global_prompt is not None and per_face_prompts is None:
+             prompt = [f"{global_prompt},{DEFAULT_VIEW_PROMPT[f]}" for f in FACES]
+         elif global_prompt is not None and per_face_prompts is not None:
+             for f in FACES:
+                 if f in per_face_prompts.keys():
+                     prompt_text = f"{global_prompt},{per_face_prompts[f]}"
+                     prompt.append(prompt_text)
+                 else:
+                     prompt.append(f"{global_prompt},{DEFAULT_VIEW_PROMPT[f]}")
+         else:
+             # Only per-face prompts were given; fall back to the default view hint for missing faces
+             prompt = [per_face_prompts.get(f, DEFAULT_VIEW_PROMPT[f]) for f in FACES]
+
+         negative_prompt_list = None
+         if negative_prompt is not None:
+             if isinstance(negative_prompt, list):
+                 if len(negative_prompt) == batch_size:
+                     negative_prompt_list = negative_prompt
+                 else:
+                     raise ValueError(
+                         f"Length of `negative_prompt` list ({len(negative_prompt)}) does not match `batch_size` ({batch_size})."
+                     )
+             else:
+                 negative_prompt_list = [negative_prompt for _ in range(batch_size)]
+
+         device = self._execution_device
+
+         # 3. Encode input prompt
+         text_encoder_lora_scale = (
+             cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+         )
+         prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+             prompt,
+             device,
+             num_images_per_prompt,
+             self.do_classifier_free_guidance,
+             negative_prompt=negative_prompt_list,
+             lora_scale=text_encoder_lora_scale,
+             clip_skip=self.clip_skip,
+         )
+
+         # For classifier-free guidance, we need to do two forward passes.
+         # Here we concatenate the unconditional and text embeddings into a single batch
+         # to avoid doing two forward passes
+         if self.do_classifier_free_guidance:
+             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+         if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+             image_embeds = self.prepare_ip_adapter_image_embeds(
+                 ip_adapter_image,
+                 ip_adapter_image_embeds,
+                 device,
+                 batch_size * num_images_per_prompt,
+                 self.do_classifier_free_guidance,
+             )
+
+         # 4. Set timesteps
+         timesteps, num_inference_steps = retrieve_timesteps(
+             self.scheduler, num_inference_steps, device, timesteps, sigmas
+         )
+         timesteps, num_inference_steps = self.get_timesteps(
+             num_inference_steps=num_inference_steps, strength=strength, device=device
+         )
+         # check that the number of inference steps is not < 1 - as this doesn't make sense
+         if num_inference_steps < 1:
+             raise ValueError(
+                 f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline "
+                 f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+             )
+         # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
+         latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+         # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
+         is_strength_max = strength == 1.0
+
+         # 5. Preprocess mask and image
+
+         # if padding_mask_crop is not None:
+         #     crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
+         #     resize_mode = "fill"
+         # else:
+         #     crops_coords = None
+         #     resize_mode = "default"
+
+         original_image = image
+         cond_image = self.image_processor.preprocess(image, height=height, width=width)
+         cond_image = cond_image.to(dtype=torch.float32)
+
+         empty_face = torch.zeros_like(cond_image, dtype=torch.float32)
+         init_image = [cond_image] + [empty_face for _ in range(5)]
+
+         # combine the condition image with empty faces to make a tensor of shape (6, 3, H, W)
+         # (cat, not stack: each preprocessed face already carries a batch dimension of 1)
+         init_image = torch.cat(init_image, dim=0)
+
+         # 6. Prepare latent variables
+         num_channels_latents = self.vae.config.latent_channels
+         num_channels_unet = self.unet.config.in_channels
+         return_image_latents = num_channels_unet == 4
+
+         latents_outputs = self.prepare_latents(
+             batch_size,
+             num_channels_latents,
+             height,
+             width,
+             prompt_embeds.dtype,
+             device,
+             generator,
+             latents,
+             image=init_image,  # lets prepare_latents return image latents when the UNet takes 4 channels
+             timestep=latent_timestep,
+             is_strength_max=is_strength_max,
+             return_noise=True,
+             return_image_latents=return_image_latents,
+         )
+
+         if return_image_latents:
+             latents, noise, image_latents = latents_outputs
+         else:
+             latents, noise = latents_outputs
+
+         mask_condition = self.prepare_cubemap_mask_condition(width, height)
+         masked_image = init_image
+
+         mask, masked_image_latents = self.prepare_mask_latents(
+             mask_condition,
+             masked_image,
+             batch_size,
+             height,
+             width,
+             prompt_embeds.dtype,
+             device,
+             generator,
+             self.do_classifier_free_guidance,
+         )
+
+         uv_maps = generate_cubemap_uv(height // self.vae_scale_factor, width // self.vae_scale_factor)
+
+         uv_channels = torch.stack([uv_maps[face] for face in FACES]).to(dtype=prompt_embeds.dtype, device=device)
+         uv_channel_input = torch.cat([uv_channels] * 2) if self.do_classifier_free_guidance else uv_channels
+
+         # # 8. Check that sizes of mask, masked image and latents match
+         # if num_channels_unet == 9:
+         #     # default case for runwayml/stable-diffusion-inpainting
+         #     num_channels_mask = mask.shape[1]
+         #     num_channels_masked_image = masked_image_latents.shape[1]
+         #     if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+         #         raise ValueError(
+         #             f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+         #             f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+         #             f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+         #             f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+         #             " `pipeline.unet` or your `mask_image` or `image` input."
+         #         )
+         # elif num_channels_unet != 4:
+         #     raise ValueError(
+         #         f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
+         #     )
+
+         # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+         # 9.1 Add image embeds for IP-Adapter
+         added_cond_kwargs = (
+             {"image_embeds": image_embeds}
+             if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+             else None
+         )
+
+         # 9.2 Optionally get Guidance Scale Embedding
+         timestep_cond = None
+         if self.unet.config.time_cond_proj_dim is not None:
+             guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+             timestep_cond = self.get_guidance_scale_embedding(
+                 guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+             ).to(device=device, dtype=latents.dtype)
+
+         # 10. Denoising loop
+         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+         self._num_timesteps = len(timesteps)
+         with self.progress_bar(total=num_inference_steps) as progress_bar:
+             for i, t in enumerate(timesteps):
+                 if self.interrupt:
+                     continue
+
+                 # expand the latents if we are doing classifier-free guidance
+                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                 # concat latents, mask, masked_image_latents in the channel dimension
+                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                 # print("Unet Channels:", num_channels_unet)
+
+                 if num_channels_unet == 11:
+                     latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents, uv_channel_input], dim=1)
+                     # print("Latent Input Shape:", latent_model_input.shape)
+
+                 # predict the noise residual
+                 noise_pred = self.unet(
+                     latent_model_input,
+                     t,
+                     encoder_hidden_states=prompt_embeds,
+                     timestep_cond=timestep_cond,
+                     cross_attention_kwargs=self.cross_attention_kwargs,
+                     added_cond_kwargs=added_cond_kwargs,
+                     return_dict=False,
+                 )[0]
+
+                 # perform guidance
+                 if self.do_classifier_free_guidance:
+                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                 # compute the previous noisy sample x_t -> x_t-1
+                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                 if num_channels_unet == 4:
+                     init_latents_proper = image_latents
+                     if self.do_classifier_free_guidance:
+                         init_mask, _ = mask.chunk(2)
+                     else:
+                         init_mask = mask
+
+                     if i < len(timesteps) - 1:
+                         noise_timestep = timesteps[i + 1]
+                         init_latents_proper = self.scheduler.add_noise(
+                             init_latents_proper, noise, torch.tensor([noise_timestep])
+                         )
+
+                     latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                 if callback_on_step_end is not None:
+                     callback_kwargs = {}
+                     for k in callback_on_step_end_tensor_inputs:
+                         callback_kwargs[k] = locals()[k]
+                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                     latents = callback_outputs.pop("latents", latents)
+                     prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                     negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                     mask = callback_outputs.pop("mask", mask)
+                     masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
+
+                 # call the callback, if provided
+                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                     progress_bar.update()
+                     if callback is not None and i % callback_steps == 0:
+                         step_idx = i // getattr(self.scheduler, "order", 1)
+                         callback(step_idx, t, latents)
+
+         if output_type != "latent":
+             condition_kwargs = {}
+             if isinstance(self.vae, AsymmetricAutoencoderKL):
+                 init_image = init_image.to(device=device, dtype=masked_image_latents.dtype)
+                 init_image_condition = init_image.clone()
+                 init_image = self._encode_vae_image(init_image, generator=generator)
+                 mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype)
+                 condition_kwargs = {"image": init_image_condition, "mask": mask_condition}
+             image = self.vae.decode(
+                 latents / self.vae.config.scaling_factor, return_dict=False, generator=generator, **condition_kwargs
+             )[0]
+             image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+         else:
+             image = latents
+             has_nsfw_concept = None
+
+         if has_nsfw_concept is None:
+             do_denormalize = [True] * image.shape[0]
+         else:
+             do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+         # put the (postprocessed) conditioning image back in as face 0
+         input_img = self.image_processor.postprocess(cond_image, output_type=output_type, do_denormalize=do_denormalize)[0]
+         image[0] = input_img
+
+         # if padding_mask_crop is not None:
+         #     image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
+
+         # Offload all models
+         self.maybe_free_model_hooks()
+
+         if not return_dict:
+             return (image, has_nsfw_concept)
+
+         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
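A hypothetical end-to-end sketch of calling the pipeline above. The base checkpoint, the component wiring, and the condition image path are assumptions for illustration; what the pipeline itself requires is a UNet with 11 input channels (4 latent + 1 mask + 4 masked-image latent + 2 UV, matching the `num_channels_unet == 11` branch), one conditioning image for face 0, and the `global_prompt`/`per_face_prompts` pair:

```python
import torch
from PIL import Image
from diffusers import StableDiffusionInpaintPipeline

from scripts.cubemap_diffusion_pipeline import CubemapDiffusionInpaintPipeline
from scripts.cubemap_unet import CubemapUNet

# Assumption: borrow components from a stock inpainting checkpoint and expand
# its UNet to 11 input channels; in practice the repo's own fine-tuned cubemap
# weights would be loaded here instead.
base = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
unet = CubemapUNet(base.unet, in_channels=11)

pipe = CubemapDiffusionInpaintPipeline(
    vae=base.vae,
    text_encoder=base.text_encoder,
    tokenizer=base.tokenizer,
    unet=unet,
    scheduler=base.scheduler,
    safety_checker=base.safety_checker,
    feature_extractor=base.feature_extractor,
).to("cuda")

cond = Image.open("examples/office.png").convert("RGB")  # conditioning image for face 0
result = pipe(
    global_prompt="a modernist open office interior",
    per_face_prompts={"top": "clean white ceiling, linear continuous lights"},
    image=cond,
    num_inference_steps=30,
)
faces = result.images  # six faces in FACES order; face 0 is the condition image
```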
scripts/cubemap_unet.py ADDED
@@ -0,0 +1,79 @@
+ import torch
+ import torch.nn as nn
+ from diffusers import UNet2DConditionModel
+ from diffusers.models.attention import BasicTransformerBlock
+ from .cubemap_unet_attention_processor import InflatedAttentionProcessor
+
+ class CubemapUNet(UNet2DConditionModel):
+
+     def __init__(self, pretrained_unet, in_channels=11):
+         """
+         1. Build the UNet from `pretrained_unet`'s config (original channel count)
+         2. Load the full set of pretrained UNet weights
+         3. Expand `conv_in` to the new input channel count
+         """
+
+         # Copy all parameters from `pretrained_unet.config` (a dict copy, so the source model's config is untouched);
+         # `in_channels` is updated later via `register_to_config`
+         unet_config = {**pretrained_unet.config}
+
+         super().__init__(**unet_config)  # pass the full config through
+
+         # Step 2: load all of `pretrained_unet`'s weights
+         print("Loading pretrained UNet weights...")
+         self.load_state_dict(pretrained_unet.state_dict(), strict=True)
+         print("✅ Pretrained UNet weights loaded!")
+
+         # Step 3: expand `conv_in` to `in_channels` (e.g. 11) input channels
+         self._expand_conv_in(pretrained_unet, in_channels)
+         self.register_to_config(in_channels=in_channels)
+
+         cubemap_attn_processor = InflatedAttentionProcessor()
+
+         self._modify_attn_processor(cubemap_attn_processor)
+
+     def _expand_conv_in(self, pretrained_unet, new_in_channels):
+         """Expand `conv_in` to accept the new number of input channels."""
+         old_conv_in = pretrained_unet.conv_in
+         old_weight = old_conv_in.weight  # [out_channels, 4, kernel, kernel]
+
+         # Create the new `conv_in`
+         new_conv_in = nn.Conv2d(
+             new_in_channels,
+             old_conv_in.out_channels,
+             kernel_size=old_conv_in.kernel_size,
+             stride=old_conv_in.stride,
+             padding=old_conv_in.padding
+         )
+
+         old_channels = old_weight.shape[1]
+
+         # Copy the weights of the original (e.g. 4) input channels
+         new_weight = torch.zeros((new_conv_in.out_channels, new_in_channels, *old_conv_in.kernel_size))
+         new_weight[:, :old_channels, :, :] = old_weight
+
+         # Randomly initialize the newly added channels with small values
+         new_weight[:, old_channels:, :, :] = torch.randn_like(new_weight[:, old_channels:, :, :]) * 0.01
+
+         new_conv_in.weight = nn.Parameter(new_weight)
+         if old_conv_in.bias is not None:
+             new_conv_in.bias = nn.Parameter(old_conv_in.bias.clone())
+
+         self.conv_in = new_conv_in
+         print(f"✅ `conv_in` expanded! New input channels: {new_in_channels}")
+
+     def _modify_attn_processor(self, processor):
+
+         for name, module in self.named_modules():
+             if isinstance(module, BasicTransformerBlock):
+                 if hasattr(module, 'attn1'):
+                     module.attn1.set_processor(processor)
+
+                 if hasattr(module, 'attn2'):
+                     module.attn2.set_processor(processor)
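A small sanity check of the `conv_in` expansion, using a tiny randomly initialized donor UNet so nothing needs to be downloaded. It assumes the donor's config round-trips through the constructor the way the class splats `{**pretrained_unet.config}`, and it needs flash-attn installed, since the processor module above imports it:

```python
import torch
from diffusers import UNet2DConditionModel
from scripts.cubemap_unet import CubemapUNet

# A tiny 4-channel donor UNet standing in for a pretrained SD UNet.
donor = UNet2DConditionModel(
    sample_size=16, in_channels=4, out_channels=4,
    block_out_channels=(32, 64), layers_per_block=1,
    down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
    up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
    cross_attention_dim=32,
)
unet = CubemapUNet(donor, in_channels=11)

assert unet.conv_in.in_channels == 11
# The first 4 input channels keep the donor's weights; the 7 extra ones start near zero.
assert torch.allclose(unet.conv_in.weight[:, :4], donor.conv_in.weight)
print(unet.conv_in.weight[:, 4:].abs().mean())  # ~0.008 with the 0.01-scaled init
```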
scripts/cubemap_unet_attention_processor.py ADDED
@@ -0,0 +1,99 @@
+ from typing import Optional
+ import torch
+ import torch.nn as nn
+ from diffusers.models.attention import BasicTransformerBlock
+ from diffusers.models.attention_processor import Attention, AttnProcessor
+ from flash_attn.flash_attn_interface import flash_attn_func
+
+ class InflatedAttentionProcessor(AttnProcessor):
+     def __call__(
+         self,
+         attn: Attention,
+         hidden_states: torch.Tensor,
+         encoder_hidden_states: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         temb: Optional[torch.Tensor] = None,
+         num_views: int = 6,  # e.g. a cubemap has 6 views
+         *args,
+         **kwargs,
+     ) -> torch.Tensor:
+         """
+         Inflated attention as in the CubeDiff paper:
+         - reshape the input from `(B*F, N, C)` to `(B, F*N, C)`
+         - run self-attention over the joint `F*N` token dimension
+         """
+         residual = hidden_states
+
+         # 1️⃣ Preprocessing
+         if attn.spatial_norm is not None:
+             hidden_states = attn.spatial_norm(hidden_states, temb)
+
+         input_ndim = hidden_states.ndim
+
+         if input_ndim == 4:
+             batch_size, channel, height, width = hidden_states.shape
+             hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+         BXF, N, C = hidden_states.shape  # original attention input
+         # 2️⃣ Reshape `(B*F, N, C) → (B, F*N, C)`
+         F = num_views
+         B = BXF // F
+
+         # 3️⃣ Standard attention computation
+         attention_mask = attn.prepare_attention_mask(attention_mask, hidden_states.shape[1], B)
+
+         if attn.group_norm is not None:
+             hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+         is_self_attn = False
+
+         if encoder_hidden_states is None:
+             hidden_states = hidden_states.view(B, F, N, C)
+             hidden_states = hidden_states.reshape(B, F * N, C)
+             encoder_hidden_states = hidden_states
+             is_self_attn = True
+         elif attn.norm_cross:
+             encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+         query = attn.to_q(hidden_states)
+         key = attn.to_k(encoder_hidden_states)
+         value = attn.to_v(encoder_hidden_states)
+
+         query = attn.head_to_batch_dim(query, out_dim=4).permute(0, 2, 1, 3)
+         key = attn.head_to_batch_dim(key, out_dim=4).permute(0, 2, 1, 3)
+         value = attn.head_to_batch_dim(value, out_dim=4).permute(0, 2, 1, 3)
+
+         hidden_states = flash_attn_func(query, key, value, dropout_p=0.0, causal=False)
+         B, L, H, D = hidden_states.shape
+         hidden_states = hidden_states.view(B, L, H * D)
+
+         # query = attn.head_to_batch_dim(query)
+         # key = attn.head_to_batch_dim(key)
+         # value = attn.head_to_batch_dim(value)
+
+         # attention_probs = attn.get_attention_scores(query, key, attention_mask)
+         # hidden_states = torch.bmm(attention_probs, value)
+         # hidden_states = attn.batch_to_head_dim(hidden_states)
+
+         # 4️⃣ Output projection & dropout
+         hidden_states = attn.to_out[0](hidden_states)
+         hidden_states = attn.to_out[1](hidden_states)
+
+         if is_self_attn:
+             # 5️⃣ Restore shape `(B, F*N, C) → (B*F, N, C)`
+             hidden_states = hidden_states.view(B, F, N, C)
+             hidden_states = hidden_states.reshape(BXF, N, C)
+
+         if input_ndim == 4:
+             hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+         if attn.residual_connection:
+             hidden_states = hidden_states + residual
+
+         hidden_states = hidden_states / attn.rescale_output_factor
+
+         return hidden_states
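The heart of the processor is the reshape bookkeeping that lets all six faces attend to each other during self-attention. A self-contained sketch of just that step, independent of flash-attn; the shape values are arbitrary test numbers:

```python
import torch

B, F, N, C = 2, 6, 64, 320  # cubemaps, faces, tokens per face, channels
hidden = torch.randn(B * F, N, C)  # the layout the UNet hands to attn1

# Inflate: fold the face axis into the token axis, so self-attention
# runs jointly over the F * N tokens of each cubemap.
inflated = hidden.view(B, F, N, C).reshape(B, F * N, C)

# ... attention over F * N tokens would happen here ...

# Deflate: restore the per-face layout expected by the rest of the block.
restored = inflated.view(B, F, N, C).reshape(B * F, N, C)
assert torch.equal(hidden, restored)
```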
scripts/cubemap_vae.py ADDED
@@ -0,0 +1,177 @@
+ import torch
+ from diffusers.models import AutoencoderKL
+ from torch import nn
+ from torchvision.transforms import ToPILImage
+ from torch import Tensor
+
+ class SynchronizedGroupNorm(nn.Module):
+     def __init__(self, original_group_norm: nn.GroupNorm, num_views: int = 6):
+         super().__init__()
+         self.num_views = num_views
+
+         # Inherit the original grouping parameters
+         self.num_groups = original_group_norm.num_groups
+         self.num_channels = original_group_norm.num_channels
+         self.eps = original_group_norm.eps
+
+         # Restructure the parameters per channel group
+         self.group_size = self.num_channels // self.num_groups
+
+         # Inherit the original parameters (keeping one affine transform per group)
+         self.weight = nn.Parameter(original_group_norm.weight.detach().view(self.num_groups, self.group_size))
+         self.bias = nn.Parameter(original_group_norm.bias.detach().view(self.num_groups, self.group_size))
+
+     def forward(self, x: torch.Tensor):
+         """Supports both 3D (B, C, D) and 4D (B, C, H, W) inputs."""
+
+         # print(f"Input shape: {x.shape}")  # Debugging
+
+         # Read off the input shape
+         BxT, C = x.shape[:2]  # only the first two dimensions
+         B = BxT // self.num_views  # recover the true batch dimension
+
+         # Handle 3D (B, C, D) input
+         if x.dim() == 3:
+             D = x.shape[2]
+             x = x.view(B, self.num_views, self.num_groups, self.group_size, D)
+
+             # GroupNorm statistics, shared across the views
+             mean = x.mean(dim=(1, 3, 4), keepdim=True)
+             var = x.var(dim=(1, 3, 4), keepdim=True, unbiased=False)
+             x = (x - mean) / torch.sqrt(var + self.eps)
+             # Broadcast weight and bias to the grouped layout
+             weight = self.weight.view(1, self.num_groups, self.group_size, 1)
+             bias = self.bias.view(1, self.num_groups, self.group_size, 1)
+             x = x * weight + bias
+
+             # Restore the original shape
+             return x.view(BxT, C, D)
+
+         # Handle 4D (B, C, H, W) input
+         elif x.dim() == 4:
+             H, W = x.shape[2:]
+             x = x.view(B, self.num_views, self.num_groups, self.group_size, H, W)
+
+             # GroupNorm statistics, shared across the views
+             mean = x.mean(dim=(1, 3, 4, 5), keepdim=True)
+             var = x.var(dim=(1, 3, 4, 5), keepdim=True, unbiased=False)
+             x = (x - mean) / torch.sqrt(var + self.eps)
+             # Broadcast weight and bias to the grouped layout
+             weight = self.weight.view(1, self.num_groups, self.group_size, 1, 1)
+             bias = self.bias.view(1, self.num_groups, self.group_size, 1, 1)
+             x = x * weight + bias
+
+             # Restore the original shape
+             return x.view(BxT, C, H, W)
+
+         else:
+             raise ValueError(f"Unsupported input shape: {x.shape}, expected 3D (B, C, D) or 4D (B, C, H, W).")
+
+
+ class CubemapVAE(AutoencoderKL):
+     def __init__(self, pretrained_vae, num_views=6, in_channels=3, image_size=512):
+         super().__init__(  # inherits from AutoencoderKL
+             act_fn="silu",
+             block_out_channels=[128, 256, 512, 512],
+             down_block_types=[
+                 "DownEncoderBlock2D",
+                 "DownEncoderBlock2D",
+                 "DownEncoderBlock2D",
+                 "DownEncoderBlock2D"
+             ],
+             up_block_types=[
+                 "UpDecoderBlock2D",
+                 "UpDecoderBlock2D",
+                 "UpDecoderBlock2D",
+                 "UpDecoderBlock2D"
+             ],
+             latent_channels=pretrained_vae.config.latent_channels,
+             in_channels=in_channels,
+             out_channels=in_channels
+         )
+         self.num_views = num_views
+         self.in_channels = in_channels
+
+         # --- Swap in the pretrained modules and adapt them for cubemaps ---
+         # Cubemap-specific encoder/decoder variants were considered (commented below);
+         # here the pretrained encoder/decoder are reused directly.
+         # self.encoder = CubemapEncoder(pretrained_encoder=pretrained_vae.encoder, num_views=num_views, in_channels=in_channels)
+         # self.decoder = CubemapDecoder(pretrained_decoder=pretrained_vae.decoder, num_views=num_views, out_channels=in_channels, in_channels=4)
+         self.encoder = pretrained_vae.encoder
+         self.decoder = pretrained_vae.decoder
+         self.quant_conv = pretrained_vae.quant_conv
+         self.post_quant_conv = pretrained_vae.post_quant_conv
+         # Replace every GroupNorm with the view-synchronized variant
+         replace_group_norm_with_sgn(self, num_views=num_views)
+
+     def encode(self, images, return_dict: bool = True):
+         batch_size, num_views, num_channels, height, width = images.shape
+         images = images.view(batch_size * num_views, num_channels, height, width)
+         return super().encode(images, return_dict=return_dict)
+
+     def decode(self, latents, return_dict=True, **kwargs):
+         """
+         Custom VAE decoding:
+         - drop the UV channels (keep only the first 4 latent channels)
+         - fall back to the original VAE decoding path
+         """
+
+         print("Decoder received latent shape:", latents.shape)
+         # Keep only the first 4 latent channels, dropping any extra UV channels
+         if latents.shape[1] > 4:
+             latents = latents[:, :4, :, :]
+
+         return super().decode(latents, return_dict=return_dict, **kwargs)
+
+     def decode_to_tensor(self, latents):
+         decoded = self.decode(latents).sample  # (B*6, 3, H, W)
+
+         B = latents.shape[0] // 6
+         images = torch.split(decoded, B, dim=0)  # split back per face group
+
+         return images  # tuple of 6 tensors
+
+     def decode_to_pil_images(self, latents: Tensor):
+         images = self.decode_to_tensor(latents)  # the 6 face images
+         to_pil = ToPILImage()
+
+         return [to_pil(img[0].cpu().detach()) for img in images]  # convert to PIL
+
+
+ def replace_group_norm_with_sgn(model, num_views):
+     """Walk the model and replace every nn.GroupNorm with a SynchronizedGroupNorm."""
+     replacements = []  # collect the module names to replace first
+     for name, module in model.named_modules():
+         if isinstance(module, nn.GroupNorm):
+             replacements.append(name)
+
+     for name in replacements:
+         parent_module, attr_name = get_parent_module(model, name)
+         setattr(parent_module, attr_name, SynchronizedGroupNorm(getattr(parent_module, attr_name), num_views))
+
+ def get_parent_module(model, module_name):
+     """Return the parent module of `module_name` and the attribute name within it."""
+     names = module_name.split(".")
+     parent_module = model
+     for name in names[:-1]:  # walk down to the second-to-last level
+         parent_module = getattr(parent_module, name)
+     return parent_module, names[-1]
+
+
+ def flatten_face_names(face_names):
+     flat_face_names = []
+     for item in face_names:
+         if isinstance(item, str):  # already a plain string
+             flat_face_names.append(item)
+         elif isinstance(item, (list, tuple)):  # a list/tuple: flatten its strings
+             flat_face_names.extend(item)
+         else:
+             raise ValueError(f"Unexpected type in face_names: {type(item)}")
+     return flat_face_names
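A quick check that the synchronized norm is a drop-in replacement for `nn.GroupNorm` under the `(B * num_views, C, H, W)` layout used above; the shapes here are arbitrary test values:

```python
import torch
from torch import nn
from scripts.cubemap_vae import SynchronizedGroupNorm

gn = nn.GroupNorm(num_groups=32, num_channels=128)
sgn = SynchronizedGroupNorm(gn, num_views=6)

x = torch.randn(2 * 6, 128, 16, 16)  # 2 cubemaps, 6 faces each
y = sgn(x)
assert y.shape == x.shape
# Per (batch, group), mean/var are computed jointly across all 6 views,
# so adjacent faces are normalized with consistent statistics.
```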
scripts/utils.py ADDED
@@ -0,0 +1,576 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ from torch import Tensor
4
+ import torchvision.transforms.functional as TF
5
+ import torch.nn.functional as F
6
+ import cv2
7
+ import py360convert
8
+ import argparse
9
+ import os
10
+ import numpy as np
11
+ from numpy.typing import NDArray
12
+ from PIL import Image
13
+
14
+ # 6 视角定义(前、后、左、右、上、下)
15
+ FACES = ["front", "back", "left", "right", "top", "bottom"]
16
+
17
+ FACE_CAPTION_MAP={
18
+ "front": "caption_front",
19
+ "back": "caption_back",
20
+ "left": "caption_left",
21
+ "right": "caption_right",
22
+ "top": "caption_top",
23
+ "bottom": "caption_bottom"
24
+ }
25
+
26
+ FACE_KEYS_MAP = {
27
+ "front": "F",
28
+ "back": "B",
29
+ "left": "L",
30
+ "right": "R",
31
+ "top": "U",
32
+ "bottom": "D"
33
+ }
34
+
35
+
36
+ def load_cubemap_dict(cubemap_path_dict:dict):
37
+ """
38
+ 从字典中加载 Cubemap 图像
39
+ """
40
+ cubemap_dict = {}
41
+ for face, path in cubemap_path_dict.items():
42
+ image = cv2.imread(path)
43
+ if image is None:
44
+ print(f"❌ 读取失败: {path}")
45
+ continue
46
+ cubemap_dict[FACE_KEYS_MAP[face]] = image
47
+ return cubemap_dict
48
+
49
+
50
+ def convert_to_cubemap(image, size=512):
51
+ """
52
+ 使用 py360convert 将 equirectangular 全景图转换为 6 视角 Cubemap
53
+ """
54
+ cubemap_dict = py360convert.e2c(image, face_w=size, mode="bilinear", cube_format="dict")
55
+ return cubemap_dict
56
+
57
+ def to_cubemap_dict(images:list[NDArray]):
58
+ cubemap_dict={}
59
+ for i, face in enumerate(FACES):
60
+ key=FACE_KEYS_MAP[face]
61
+ cubemap_dict[key]=images[i]
62
+
63
+ return cubemap_dict
64
+
65
+ def convert_to_equirectangular(cubemap_dict, width=1024,height=512):
66
+ """
67
+ 使用 py360convert 将 6 视角 Cubemap 转换为 equirectangular 全景图
68
+ """
69
+ equirectangular_image = py360convert.c2e(cubemap_dict, w=width,h=height, mode="bilinear",cube_format="dict")
70
+
71
+ if equirectangular_image.dtype == np.float32:
72
+ equirectangular_image = np.clip(equirectangular_image * 255, 0, 255).astype(np.uint8)
73
+
74
+ return Image.fromarray(equirectangular_image)
75
+
76
+
77
+ def process_image_e2c(input_path, output_dir, size=512):
78
+ """
79
+ 读取 equirectangular 全景图,转换为 Cubemap 并保存 6 张单独的图像
80
+ """
81
+ image = cv2.imread(input_path)
82
+ if image is None:
83
+ print(f"❌ 读取失败: {input_path}")
84
+ return
85
+
86
+ os.makedirs(output_dir, exist_ok=True)
87
+
88
+ # 生成 Cubemap
89
+ cubemap_images = convert_to_cubemap(image, size)
90
+
91
+ print(cubemap_images.keys())
92
+
93
+ # 保存 6 张图像
94
+ for face in FACES:
95
+ output_path = os.path.join(output_dir, f"{face}.png")
96
+ face_key = FACE_KEYS_MAP[face]
97
+ cv2.imwrite(output_path, cubemap_images[face_key])
98
+ print(f"✅ {face} 视角已保存: {output_path}")
99
+
100
+ def process_image_c2e(cubemap_path_dict, output_path, width,height):
101
+ """
102
+ 读取 6 视角 Cubemap,转换为 equirectangular 全景图并保存
103
+ """
104
+ cubemap_dict=load_cubemap_dict(cubemap_path_dict)
105
+ # 生成 equirectangular 全景图
106
+ equirectangular_image = convert_to_equirectangular(cubemap_dict, width,height)
107
+ cv2.imwrite(output_path, equirectangular_image)
108
+ print(f"✅ 全景图已保存: {output_path}")
109
+
110
+
111
+
112
+ def perspective_transform_patch(patch: torch.Tensor, delta):
113
+ """
114
+ 对输入的 patch 使用 torchvision.transforms.functional.perspective 进行透视变换。
115
+
116
+ 参数:
117
+ patch: Tensor,形状 (C, H, W),图像 patch
118
+ offset: float,表示左右方向的偏移量(单位:像素),用于定义目标透视变换的 endpoints
119
+ 例如:正值表示上边向右平移,下边向左平移;负值则相反。
120
+
121
+ 返回:
122
+ transformed: 透视变换后的 patch,Tensor,形状与 patch 相同
123
+ """
124
+ C, H, W = patch.shape
125
+
126
+ # 定义原始四个角的坐标(顺序为:上左, 上右, 下右, 下左)
127
+ startpoints = [
128
+ [0, 0], # top-left
129
+ [W, 0], # top-right
130
+ [W, H], # bottom-right
131
+ [0, H] # bottom-left
132
+ ]
133
+
134
+ endpoints=[[sp_i + d_i for sp_i, d_i in zip(sp, d)] for sp, d in zip(startpoints, delta)]
135
+ # 注意:F.perspective 接受的 startpoints 和 endpoints 应为 List[List[float]]
136
+ # 透视变换支持直接传入 tensor,但这里直接使用 list 即可。
137
+ return TF.perspective(patch, startpoints, endpoints, interpolation=TF.InterpolationMode.BILINEAR)
138
+
139
+
140
+ def stretch_edge_patch(patch,pad_width,edge_key):
141
+ C,H,W=patch.shape
142
+ H_new=H + 2*pad_width
143
+ W_new=W + 2*pad_width
144
+ if edge_key=="top":
145
+ top_edge=TF.resize(patch,(pad_width,W_new))
146
+ delta_top = [
147
+ [0, 0], # top-left
148
+ [0, 0], # top-right
149
+ [-pad_width , 0], # bottom-right
150
+ [pad_width , 0] # bottom-left
151
+ ]
152
+ return perspective_transform_patch(top_edge, delta_top)
153
+
154
+ elif edge_key=="bottom":
155
+ bottom_edge=TF.resize(patch,(pad_width,W_new))
156
+ delta_bottom=[
157
+ [pad_width, 0], # top-left
158
+ [-pad_width, 0], # top-right
159
+ [0, 0], # bottom-right
160
+ [0 , 0] # bottom-left
161
+ ]
162
+ return perspective_transform_patch(bottom_edge,delta_bottom)
163
+ elif edge_key=="left":
164
+ left_edge=TF.resize(patch,(H_new,pad_width))
165
+ delta_left=[
166
+ [0, 0], # top-left 变为 (offset, 0)
167
+ [0, pad_width], # top-right 变为 (W+offset, 0)
168
+ [0, -pad_width], # bottom-right 变为 (W-offset, H)
169
+ [0 , 0] # bottom-left 变为 (-offset, H)
170
+ ]
171
+ return perspective_transform_patch(left_edge,delta_left)
172
+ elif edge_key=="right":
173
+ right_edge=TF.resize(patch,(H_new,pad_width))
174
+ delta_right=[
175
+ [0, pad_width], # top-left 变为 (offset, 0)
176
+ [0, 0], # top-right 变为 (W+offset, 0)
177
+ [0, 0], # bottom-right 变为 (W-offset, H)
178
+ [0 , -pad_width] # bottom-left 变为 (-offset, H)
179
+ ]
180
+ return perspective_transform_patch(right_edge,delta_right)
181
+
182
+ # -------------------------------
183
+ # 定义各面拼接函数
184
+ # -------------------------------
185
+
186
+ def pad_front(face:Tensor, faces, pad_width):
187
+ """
188
+ 对前视图进行边缘拼接:
189
+ 上侧:拼接上视图的下边缘 (取 top[:, -w:, :])
190
+ 下侧:拼接下视图的上边缘 (取 bottom[:, :w, :])
191
+ 左侧:拼接左视图的右边缘 (取 left[:, :, -w:])
192
+ 右侧:拼接右视图的左边缘 (取 right[:, :, :w])
193
+ """
194
+ C, H, W = face.shape
195
+ H_new=H + 2*pad_width
196
+ W_new=W + 2*pad_width
197
+ padded = torch.zeros((C, H_new, W_new), dtype=face.dtype, device=face.device)
198
+ # 中间放置前视图
199
+ padded[:, pad_width:pad_width+H, pad_width:pad_width+W] = face
200
+ # 上边缘
201
+ top_edge=faces['top'][:, -pad_width:, :]
202
+ padded[:, 0:pad_width, 0:W+2*pad_width] += stretch_edge_patch(top_edge,pad_width,"top")
203
+
204
+ # 下边缘
205
+ bottom_edge=faces['bottom'][:, :pad_width, :]
206
+ padded[:, H+pad_width:H+2*pad_width, :] += stretch_edge_patch(bottom_edge,pad_width,"bottom")
207
+
208
+ # 左边缘
209
+ left_edge=faces['left'][:, :, -pad_width:]
210
+ padded[:, :, 0:pad_width] += stretch_edge_patch(left_edge,pad_width,"left")
211
+
212
+ # 右边缘
213
+ right_edge=faces['right'][:, :, :pad_width]
214
+ padded[:, :, W+pad_width:W+2*pad_width] += stretch_edge_patch(right_edge,pad_width,"right")
215
+ return padded
216
+
217
+ def pad_right(face, faces, pad_width):
+     """
+     Stitch neighbouring edges onto the right view:
+       left:   the right edge of the front view   (front[:, :, -w:])
+       right:  the left edge of the back view     (back[:, :, :w])
+       top:    the right edge of the top view     (top[:, :, -w:])
+       bottom: the right edge of the bottom view  (bottom[:, :, -w:])
+     """
+     C, H, W = face.shape
+     H_new = H + 2 * pad_width
+     W_new = W + 2 * pad_width
+     padded = torch.zeros((C, H_new, W_new), dtype=face.dtype, device=face.device)
+     padded[:, pad_width:pad_width + H, pad_width:pad_width + W] = face
+
+     left_edge = faces['front'][:, :, -pad_width:]
+     padded[:, :, 0:pad_width] += stretch_edge_patch(left_edge, pad_width, "left")
+
+     right_edge = faces['back'][:, :, :pad_width]
+     padded[:, :, W + pad_width:W + 2 * pad_width] += stretch_edge_patch(right_edge, pad_width, "right")
+
+     # Top: the right edge of the top view, rotated 90° clockwise.
+     # The raw edge has shape (C, H, w); after rotation it becomes (C, w, H).
+     top_edge = torch.rot90(faces['top'][:, :, -pad_width:], k=3, dims=(1, 2))
+     padded[:, 0:pad_width, :] += stretch_edge_patch(top_edge, pad_width, "top")
+
+     # Bottom: the right edge of the bottom view, rotated 90° counter-clockwise.
+     # The raw edge has shape (C, H, w); after rotation it becomes (C, w, H).
+     bottom_edge = torch.rot90(faces['bottom'][:, :, -pad_width:], k=1, dims=(1, 2))
+     padded[:, H + pad_width:H + 2 * pad_width, :] += stretch_edge_patch(bottom_edge, pad_width, "bottom")
+     return padded
+
+ def pad_back(face, faces, pad_width):
+     """
+     Stitch neighbouring edges onto the back view:
+       left:   the right edge of the right view    (right[:, :, -w:])
+       right:  the left edge of the left view      (left[:, :, :w])
+       top:    the top edge of the top view        (top[:, :w, :])
+       bottom: the bottom edge of the bottom view  (bottom[:, -w:, :])
+     """
+     C, H, W = face.shape
+     H_new = H + 2 * pad_width
+     W_new = W + 2 * pad_width
+     padded = torch.zeros((C, H_new, W_new), dtype=face.dtype, device=face.device)
+     padded[:, pad_width:pad_width + H, pad_width:pad_width + W] = face
+
+     left_edge = faces['right'][:, :, -pad_width:]
+     padded[:, :, 0:pad_width] += stretch_edge_patch(left_edge, pad_width, "left")
+
+     right_edge = faces['left'][:, :, :pad_width]
+     padded[:, :, W + pad_width:W + 2 * pad_width] += stretch_edge_patch(right_edge, pad_width, "right")
+
+     # Top: the top edge of the top view, rotated 180°
+     # (a 180° rotation is torch.rot90(..., k=2, dims=(1, 2)))
+     top_edge = torch.rot90(faces['top'][:, :pad_width, :], k=2, dims=(1, 2))
+     padded[:, 0:pad_width, :] += stretch_edge_patch(top_edge, pad_width, "top")
+
+     # Bottom: the bottom edge of the bottom view, rotated 180°
+     bottom_edge = torch.rot90(faces['bottom'][:, -pad_width:, :], k=2, dims=(1, 2))
+     padded[:, H + pad_width:H + 2 * pad_width, :] += stretch_edge_patch(bottom_edge, pad_width, "bottom")
+     return padded
+
+ def pad_left(face, faces, pad_width):
+     """
+     Stitch neighbouring edges onto the left view:
+       left:   the right edge of the back view   (back[:, :, -w:])
+       right:  the left edge of the front view   (front[:, :, :w])
+       top:    the left edge of the top view     (top[:, :, :w])
+       bottom: the left edge of the bottom view  (bottom[:, :, :w])
+     """
+     C, H, W = face.shape
+     padded = torch.zeros((C, H + 2 * pad_width, W + 2 * pad_width), dtype=face.dtype, device=face.device)
+     padded[:, pad_width:pad_width + H, pad_width:pad_width + W] = face
+
+     left_edge = faces['back'][:, :, -pad_width:]
+     padded[:, :, 0:pad_width] += stretch_edge_patch(left_edge, pad_width, "left")
+
+     right_edge = faces['front'][:, :, :pad_width]
+     padded[:, :, W + pad_width:W + 2 * pad_width] += stretch_edge_patch(right_edge, pad_width, "right")
+
+     top_edge = torch.rot90(faces['top'][:, :, :pad_width], k=1, dims=(1, 2))
+     padded[:, 0:pad_width, :] += stretch_edge_patch(top_edge, pad_width, "top")
+
+     bottom_edge = torch.rot90(faces['bottom'][:, :, :pad_width], k=3, dims=(1, 2))
+     padded[:, H + pad_width:H + 2 * pad_width, :] += stretch_edge_patch(bottom_edge, pad_width, "bottom")
+     return padded
+
+ def pad_top(face, faces, pad_width):
+     """
+     Stitch neighbouring edges onto the top view:
+       bottom: the top edge of the front view  (front[:, :w, :])
+       left:   the top edge of the left view   (left[:, :w, :])
+       right:  the top edge of the right view  (right[:, :w, :])
+       top:    the top edge of the back view   (back[:, :w, :])
+     """
+     C, H, W = face.shape
+     padded = torch.zeros((C, H + 2 * pad_width, W + 2 * pad_width), dtype=face.dtype, device=face.device)
+     padded[:, pad_width:pad_width + H, pad_width:pad_width + W] = face
+
+     bottom_edge = faces['front'][:, :pad_width, :]
+     padded[:, H + pad_width:H + 2 * pad_width, :] += stretch_edge_patch(bottom_edge, pad_width, "bottom")
+
+     left_edge = torch.rot90(faces['left'][:, :pad_width, :], k=3, dims=(1, 2))
+     padded[:, :, 0:pad_width] += stretch_edge_patch(left_edge, pad_width, "left")
+
+     right_edge = torch.rot90(faces['right'][:, :pad_width, :], k=1, dims=(1, 2))
+     padded[:, :, W + pad_width:W + 2 * pad_width] += stretch_edge_patch(right_edge, pad_width, "right")
+
+     top_edge = torch.rot90(faces['back'][:, :pad_width, :], k=2, dims=(1, 2))
+     padded[:, 0:pad_width, :] += stretch_edge_patch(top_edge, pad_width, "top")
+     return padded
+
+ def pad_bottom(face, faces, pad_width):
+     """
+     Stitch neighbouring edges onto the bottom view:
+       top:    the bottom edge of the front view  (front[:, -w:, :])
+       left:   the bottom edge of the left view   (left[:, -w:, :])
+       right:  the bottom edge of the right view  (right[:, -w:, :])
+       bottom: the bottom edge of the back view   (back[:, -w:, :])
+     """
+     C, H, W = face.shape
+     padded = torch.zeros((C, H + 2 * pad_width, W + 2 * pad_width), dtype=face.dtype, device=face.device)
+     padded[:, pad_width:pad_width + H, pad_width:pad_width + W] = face
+
+     top_edge = faces['front'][:, -pad_width:, :]
+     padded[:, 0:pad_width, :] += stretch_edge_patch(top_edge, pad_width, "top")
+
+     left_edge = torch.rot90(faces['left'][:, -pad_width:, :], k=1, dims=(1, 2))
+     padded[:, :, 0:pad_width] += stretch_edge_patch(left_edge, pad_width, "left")
+
+     right_edge = torch.rot90(faces['right'][:, -pad_width:, :], k=3, dims=(1, 2))
+     padded[:, :, W + pad_width:W + 2 * pad_width] += stretch_edge_patch(right_edge, pad_width, "right")
+
+     bottom_edge = torch.rot90(faces['back'][:, -pad_width:, :], k=2, dims=(1, 2))
+     padded[:, H + pad_width:H + 2 * pad_width, :] += stretch_edge_patch(bottom_edge, pad_width, "bottom")
+     return padded
+
+ pad_funcs = {
+     "front": pad_front,
+     "right": pad_right,
+     "back": pad_back,
+     "left": pad_left,
+     "top": pad_top,
+     "bottom": pad_bottom,
+ }
+
+ def pad_face(faces: dict, width: int, face_name: str) -> Tensor:
+     """
+     Dispatch to the stitching function that matches face_name.
+     """
+     if face_name not in pad_funcs:
+         raise ValueError(f"Invalid face name: {face_name}. Must be one of {list(pad_funcs.keys())}.")
+     return pad_funcs[face_name](faces[face_name], faces, width)
+
+
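A usage sketch for the dispatcher, assuming six random (3, 512, 512) tensors stand in for decoded cubemap faces and an illustrative pad_width of 32:

import torch

# Hypothetical example: stitch context from the neighbouring faces onto each face.
faces = {name: torch.rand(3, 512, 512)
         for name in ["front", "right", "back", "left", "top", "bottom"]}
pad_width = 32
padded = {name: pad_face(faces, pad_width, name) for name in faces}
# Every padded face gains pad_width pixels on all four sides.
assert padded["front"].shape == (3, 512 + 2 * pad_width, 512 + 2 * pad_width)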
+ def prepare_mask(image, facename):
+     """
+     Build the mask that corresponds to a given face.
+     The front view is the known input, so its mask is all zeros;
+     every other face is to be generated and gets a mask of all ones.
+     The mask has shape (1, H, W): the image's spatial size, with a single channel.
+
+     Args:
+         image (torch.Tensor): image tensor of shape (C, H, W) or (N, C, H, W)
+         facename (str): name of the face the image belongs to, e.g. "front", "back"
+
+     Returns:
+         torch.Tensor: the mask, shape (1, H, W)
+     """
+     # For (C, H, W): H = image.shape[1], W = image.shape[2]
+     # For (N, C, H, W): the spatial size sits in the last two dimensions
+     if image.ndim == 3:
+         H, W = image.shape[1], image.shape[2]
+     elif image.ndim == 4:
+         H, W = image.shape[2], image.shape[3]
+     else:
+         raise ValueError("Unsupported image shape")
+
+     mask_shape = (1, H, W)
+     if facename == "front":
+         return torch.zeros(mask_shape, dtype=image.dtype, device=image.device)
+     else:
+         return torch.ones(mask_shape, dtype=image.dtype, device=image.device)
+
+
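A sketch of the mask semantics, assuming random stand-in faces: the known front view gets an all-zero mask, every other face an all-one mask:

import torch

front = torch.rand(3, 512, 512)
back = torch.rand(3, 512, 512)
assert prepare_mask(front, "front").sum() == 0   # known face: keep
assert prepare_mask(back, "back").min() == 1     # unknown face: generate
assert prepare_mask(front, "front").shape == (1, 512, 512)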
+ def generate_cubemap_uv(H, W):
+     """Generate the normalized 3D coordinates (x, y, z) of every texel on each cube face and map them to UV."""
+     H = int(H)
+     W = int(W)
+
+     # Build a grid over [-1, 1] (the x, y coordinates on the cube face)
+     u_range = torch.linspace(-1, 1, W).view(1, -1).expand(H, -1)  # HxW
+     v_range = torch.linspace(-1, 1, H).view(-1, 1).expand(-1, W)  # HxW
+
+     # Normalized (x, y, z) coordinates for the six faces
+     faces = {
+         "front": (u_range, v_range, torch.ones_like(u_range)),    # (x, y, 1)
+         "back": (-u_range, v_range, -torch.ones_like(u_range)),   # (-x, y, -1)
+         "left": (-torch.ones_like(u_range), v_range, u_range),    # (-1, y, x)
+         "right": (torch.ones_like(u_range), v_range, -u_range),   # (1, y, -x)
+         "top": (u_range, -torch.ones_like(u_range), v_range),     # (x, -1, y)
+         "bottom": (u_range, torch.ones_like(u_range), -v_range),  # (x, 1, -y)
+     }
+
+     # Compute the UV map of each face
+     uv_faces = {}
+     for face, (x, y, z) in faces.items():
+         u = torch.atan2(x, z) / (2 * torch.pi) + 0.5
+         v = torch.atan2(y, torch.sqrt(x ** 2 + z ** 2)) / (2 * torch.pi) + 0.5
+         uv_faces[face] = torch.stack([u, v], dim=0)  # shape: (2, H, W)
+
+     return uv_faces  # UV coordinates per face
+
+
+ def generate_cubemap_uv_padding(H, W, padding_pixels=0):
+     """Same as generate_cubemap_uv, but with a configurable padding around each face."""
+     H = int(H)
+     W = int(W)
+
+     # Padding as a fraction of the face width
+     padding_ratio = padding_pixels / W  # e.g. 50 / 512 ≈ 0.0977
+
+     # Size of the enlarged grid
+     H_new = H + 2 * padding_pixels
+     W_new = W + 2 * padding_pixels
+
+     # Build the enlarged grid over [-1 - padding_ratio, 1 + padding_ratio]
+     u_range = torch.linspace(-1 - padding_ratio, 1 + padding_ratio, W_new).view(1, -1).expand(H_new, -1)
+     v_range = torch.linspace(-1 - padding_ratio, 1 + padding_ratio, H_new).view(-1, 1).expand(-1, W_new)
+
+     # Normalized (x, y, z) coordinates for the six faces
+     faces = {
+         "front": (u_range, v_range, torch.ones_like(u_range)),
+         "back": (-u_range, v_range, -torch.ones_like(u_range)),
+         "left": (-torch.ones_like(u_range), v_range, u_range),
+         "right": (torch.ones_like(u_range), v_range, -u_range),
+         "top": (u_range, -torch.ones_like(u_range), v_range),
+         "bottom": (u_range, torch.ones_like(u_range), -v_range),
+     }
+
+     # Compute the UV map of each face
+     uv_faces = {}
+     for face, (x, y, z) in faces.items():
+         u = torch.atan2(x, z) / (2 * torch.pi) + 0.5
+         v = torch.atan2(y, torch.sqrt(x ** 2 + z ** 2)) / (2 * torch.pi) + 0.5
+         uv = torch.stack([u, v], dim=0)  # shape: (2, H_new, W_new)
+         # Bilinearly resize the UV map back to (2, H, W)
+         uv_resized = F.interpolate(uv.unsqueeze(0), size=(H, W), mode='bilinear', align_corners=True).squeeze(0)
+         uv_faces[face] = uv_resized
+
+     return uv_faces
+
+
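A quick sanity sketch for both UV generators, assuming an illustrative 64x64 face size and 8 pixels of padding; each returns six (2, H, W) maps with values in [0, 1]:

import torch

uv_plain = generate_cubemap_uv(64, 64)
uv_padded = generate_cubemap_uv_padding(64, 64, padding_pixels=8)
for face in ["front", "back", "left", "right", "top", "bottom"]:
    assert uv_plain[face].shape == (2, 64, 64)
    assert uv_padded[face].shape == (2, 64, 64)  # resized back to (H, W)
    assert 0.0 <= uv_plain[face].min() and uv_plain[face].max() <= 1.0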
+ def merge_uv_with_latent(latent, uv_maps, dim=1):
+     # Resize uv_maps to the latent's spatial size
+     # (bilinear interpolation with align_corners=False)
+     uv_maps_resized = F.interpolate(uv_maps, size=latent.shape[-2:], mode="bilinear", align_corners=False)
+
+     # Concatenate along the channel dimension, i.e. dim=1
+     latent_with_uv = torch.cat([latent, uv_maps_resized], dim=dim)
+     return latent_with_uv
+
+
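A sketch of the merge, assuming a stand-in (1, 4, 64, 64) latent and the front-face UV map from generate_cubemap_uv:

import torch

latent = torch.rand(1, 4, 64, 64)             # (N, C, H, W)
uv = generate_cubemap_uv(512, 512)["front"]   # (2, 512, 512)
uv_batch = uv.unsqueeze(0)                    # (1, 2, 512, 512)
merged = merge_uv_with_latent(latent, uv_batch)
assert merged.shape == (1, 6, 64, 64)         # 4 latent + 2 UV channels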
+ def resize_and_crop(image: np.ndarray, padding: int) -> np.ndarray:
+     """
+     Resize the input image to (H + 2 * padding, W + 2 * padding),
+     then crop `padding` pixels off each side to restore the original (H, W).
+
+     Args:
+         image (np.ndarray): input image of shape (H, W, C)
+         padding (int): border width to add and then crop away
+
+     Returns:
+         np.ndarray: the processed image, still of shape (H, W, C)
+     """
+     if not isinstance(image, np.ndarray):
+         raise ValueError("The input image must be a numpy array")
+
+     H, W = image.shape[:2]  # original size
+
+     # Step 1: resize to (H + 2 * padding, W + 2 * padding)
+     resized_image = cv2.resize(image, (W + 2 * padding, H + 2 * padding), interpolation=cv2.INTER_LINEAR)
+
+     # Step 2: crop `padding` pixels from the outer edges to get back to (H, W)
+     cropped_image = resized_image[padding:H + padding, padding:W + padding]
+
+     return cropped_image  # numpy.ndarray
+
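A round-trip sketch, assuming a random 512x512 uint8 image: the output keeps the input resolution, while the content is zoomed slightly because the border added by the resize is cropped away.

import numpy as np
import cv2

img = (np.random.rand(512, 512, 3) * 255).astype(np.uint8)
out = resize_and_crop(img, padding=16)
assert out.shape == img.shape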
+ def cubemap_unfold(cubemaps, H: int = 512, W: int = 512, channels: int = 3, transparent: bool = False) -> Image.Image:
+     # Unfold the six faces into a 3x4 cross layout.
+     # Overall canvas: 3 rows of H pixels, 4 columns of W pixels.
+     canvas_H = 3 * H
+     canvas_W = 4 * W
+
+     num_channels = channels + 1 if transparent else channels
+     # Allocate the canvas with the right shape; a grayscale canvas keeps its
+     # singleton channel until the end so the indexing below stays uniform.
+     canvas = np.zeros(shape=(canvas_H, canvas_W, num_channels), dtype=cubemaps[0].dtype)
+
+     face_imgs = {face: cubemaps[i] for i, face in enumerate(FACES)}
+
+     alpha_layer = num_channels - 1
+
+     # Layout (0-indexed):
+     #   row 0: top at (0, 1)
+     #   row 1: left, front, right, back in columns 0-3
+     #   row 2: bottom at (2, 1)
+
+     # Place top in row 0, column 1
+     row, col = 0, 1
+     if channels == 1:
+         canvas[row*H:(row+1)*H, col*W:(col+1)*W, 0] = face_imgs['top']
+     else:
+         canvas[row*H:(row+1)*H, col*W:(col+1)*W, :channels] = face_imgs['top']
+     if transparent:
+         canvas[row*H:(row+1)*H, col*W:(col+1)*W, alpha_layer] = 255  # set alpha to opaque
+
+     # Place left, front, right, back across row 1, columns 0-3
+     row = 1
+     for i, face in enumerate(['left', 'front', 'right', 'back']):
+         col = i
+         if channels == 1:
+             canvas[row*H:(row+1)*H, col*W:(col+1)*W, 0] = face_imgs[face]
+         else:
+             canvas[row*H:(row+1)*H, col*W:(col+1)*W, :channels] = face_imgs[face]
+         if transparent:
+             canvas[row*H:(row+1)*H, col*W:(col+1)*W, alpha_layer] = 255
+
+     # Place bottom in row 2, column 1
+     row, col = 2, 1
+     if channels == 1:
+         canvas[row*H:(row+1)*H, col*W:(col+1)*W, 0] = face_imgs['bottom']
+     else:
+         canvas[row*H:(row+1)*H, col*W:(col+1)*W, :channels] = face_imgs['bottom']
+     if transparent:
+         canvas[row*H:(row+1)*H, col*W:(col+1)*W, alpha_layer] = 255  # set alpha to opaque
+
+     if np.issubdtype(canvas.dtype, np.floating):
+         canvas = np.clip(canvas * 255, 0, 255).astype(np.uint8)
+
+     if channels == 1 and not transparent:
+         # Drop the singleton channel so PIL can build a grayscale ("L") image
+         return Image.fromarray(np.squeeze(canvas, axis=-1), mode="L")
+
+     return Image.fromarray(canvas)
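A sketch of unfolding, assuming FACES is the face-order list defined earlier in this file and six blank 256x256 uint8 faces:

import numpy as np

faces = [np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(6)]
cross = cubemap_unfold(faces, H=256, W=256, channels=3)
assert cross.size == (4 * 256, 3 * 256)  # PIL reports (width, height)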
viewer.html ADDED
@@ -0,0 +1,65 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <meta charset="utf-8">
+     <title>Panorama Viewer</title>
+     <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.css"/>
+     <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.js"></script>
+     <style>
+         html, body {
+             margin: 0;
+             padding: 0;
+             width: 100%;
+             height: 100%;
+             overflow: hidden;
+         }
+         #panorama {
+             width: 100%;
+             height: 100%;
+         }
+         .pnlm-load-button {
+             display: none !important;
+         }
+     </style>
+ </head>
+ <body>
+     <div id="panorama"></div>
+     <script>
+         // Listen for messages from the parent window
+         window.addEventListener('message', function(event) {
+             if (event.data.type === 'loadPanorama') {
+                 console.log('Received image URL:', event.data.image);
+
+                 // Destroy the existing viewer, if any
+                 if (window.viewer) {
+                     window.viewer.destroy();
+                 }
+
+                 // Create a new viewer
+                 window.viewer = pannellum.viewer('panorama', {
+                     type: 'equirectangular',
+                     panorama: event.data.image,
+                     autoLoad: true,
+                     autoRotate: -2,
+                     compass: true,
+                     northOffset: 0,
+                     showFullscreenCtrl: true,
+                     showControls: true,
+                     mouseZoom: true,
+                     draggable: true,
+                     friction: 0.2,
+                     minHfov: 50,
+                     maxHfov: 120,
+                     hfov: 100,
+                     onLoad: function() {
+                         console.log('Panorama loaded successfully');
+                     },
+                     onError: function(error) {
+                         console.error('Error loading panorama:', error);
+                     }
+                 });
+             }
+         });
+     </script>
+ </body>
+ </html>