zimhe
add scripts and examples
a521a3f
import torch
from torch import Tensor
import torchvision.transforms.functional as TF
import torch.nn.functional as F
import cv2
import py360convert
import argparse
import os
import numpy as np
from numpy.typing import NDArray
from PIL import Image
# The six cubemap views, in the order used throughout this module:
# front, back, left, right, top, bottom.
FACES = ["front", "back", "left", "right", "top", "bottom"]

# Face name -> caption field name for that face ("caption_<face>").
FACE_CAPTION_MAP = {face: f"caption_{face}" for face in FACES}

# Face name -> single-letter key used to index the dict-format cubemaps
# handled in this module.
FACE_KEYS_MAP = dict(zip(FACES, ("F", "B", "L", "R", "U", "D")))
def load_cubemap_dict(cubemap_path_dict: dict):
    """Read cubemap face images from disk.

    Args:
        cubemap_path_dict: Mapping of face name (a key of ``FACE_KEYS_MAP``)
            to the image file path of that face.

    Returns:
        Dict keyed by the ``FACE_KEYS_MAP`` letter of each face, holding the
        array returned by ``cv2.imread``. A face whose file cannot be read is
        reported on stdout and left out of the result.
    """
    loaded = {}
    for face_name, file_path in cubemap_path_dict.items():
        pixels = cv2.imread(file_path)
        if pixels is not None:
            loaded[FACE_KEYS_MAP[face_name]] = pixels
        else:
            # Best effort: report the unreadable file and keep going.
            print(f"❌ 读取失败: {file_path}")
    return loaded
def convert_to_cubemap(image, size=512):
    """Split an equirectangular panorama into six cube faces.

    Args:
        image: Equirectangular panorama, forwarded unchanged to
            ``py360convert.e2c``.
        size: Edge length in pixels of each cube face (passed as ``face_w``).

    Returns:
        The dict-format cubemap produced by ``py360convert.e2c`` using
        bilinear sampling.
    """
    e2c_options = {"face_w": size, "mode": "bilinear", "cube_format": "dict"}
    return py360convert.e2c(image, **e2c_options)
def to_cubemap_dict(images: list[NDArray]):
    """Turn a list of face images into a letter-keyed cubemap dict.

    Args:
        images: Face images ordered like ``FACES``; the first ``len(FACES)``
            entries are used.

    Returns:
        Dict mapping each ``FACE_KEYS_MAP`` letter to the image at the
        position of the matching face in ``FACES``.
    """
    return {FACE_KEYS_MAP[face]: images[idx] for idx, face in enumerate(FACES)}
def convert_to_equirectangular(cubemap_dict, width=1024, height=512):
    """Stitch a dict-format cubemap into an equirectangular panorama.

    Args:
        cubemap_dict: Dict-format cubemap, forwarded to ``py360convert.c2e``.
        width: Output panorama width in pixels.
        height: Output panorama height in pixels.

    Returns:
        PIL image built from the array returned by ``py360convert.c2e``.
        Floating-point results are multiplied by 255, clipped to [0, 255] and
        cast to ``uint8`` first.
    """
    equirectangular_image = py360convert.c2e(
        cubemap_dict, w=width, h=height, mode="bilinear", cube_format="dict"
    )
    # Rescale every floating dtype, not only float32, so that e.g. float64
    # output also reaches Image.fromarray as uint8. This mirrors the check
    # used by cubemap_unfold in this module.
    # NOTE(review): float data is assumed to lie in [0, 1] - confirm with callers.
    if np.issubdtype(equirectangular_image.dtype, np.floating):
        equirectangular_image = np.clip(equirectangular_image * 255, 0, 255).astype(np.uint8)
    return Image.fromarray(equirectangular_image)
def process_image_e2c(input_path, output_dir, size=512):
    """Convert a panorama file into six cube-face PNG files.

    Reads ``input_path`` with OpenCV, converts it with ``convert_to_cubemap``
    and writes ``<face>.png`` for every face in ``FACES`` into ``output_dir``
    (created if missing). Progress is printed to stdout.

    Args:
        input_path: Path of the equirectangular panorama to read.
        output_dir: Directory that receives the six face images.
        size: Edge length in pixels of each cube face.

    Returns:
        None. If the input cannot be read, a message is printed and nothing
        is written.
    """
    panorama = cv2.imread(input_path)
    if panorama is None:
        print(f"❌ 读取失败: {input_path}")
        return

    os.makedirs(output_dir, exist_ok=True)

    faces_by_key = convert_to_cubemap(panorama, size)
    print(faces_by_key.keys())

    for face_name in FACES:
        target = os.path.join(output_dir, f"{face_name}.png")
        cv2.imwrite(target, faces_by_key[FACE_KEYS_MAP[face_name]])
        print(f"✅ {face_name} 视角已保存: {target}")
def process_image_c2e(cubemap_path_dict, output_path, width, height):
    """Load six cube-face images, stitch them into a panorama and save it.

    Args:
        cubemap_path_dict: Mapping of face name to image path, as accepted by
            ``load_cubemap_dict``.
        output_path: File path of the panorama to write.
        width: Panorama width in pixels.
        height: Panorama height in pixels.

    Returns:
        None. The panorama is written to ``output_path`` and a confirmation
        is printed to stdout.
    """
    cubemap_dict = load_cubemap_dict(cubemap_path_dict)
    equirectangular_image = convert_to_equirectangular(cubemap_dict, width, height)
    # convert_to_equirectangular returns a PIL image, which cv2.imwrite cannot
    # encode, so hand OpenCV the underlying pixel array instead. The faces
    # were read with cv2.imread and their channels are never reordered in
    # this module, so the array is still in the channel order cv2.imwrite
    # expects.
    cv2.imwrite(output_path, np.asarray(equirectangular_image))
    print(f"✅ 全景图已保存: {output_path}")
def perspective_transform_patch(patch: torch.Tensor, delta):
    """Warp ``patch`` by displacing its four corners.

    Args:
        patch: Image tensor of shape (C, H, W).
        delta: Four ``[dx, dy]`` displacements, one per corner in the order
            top-left, top-right, bottom-right, bottom-left. Each is added to
            the matching corner ``[0, 0]``, ``[W, 0]``, ``[W, H]``, ``[0, H]``
            to obtain the target corner.

    Returns:
        The result of ``TF.perspective`` with bilinear interpolation.
    """
    _, height, width = patch.shape
    # Source corners: top-left, top-right, bottom-right, bottom-left.
    corners = [[0, 0], [width, 0], [width, height], [0, height]]
    moved = []
    for corner, shift in zip(corners, delta):
        moved.append([coord + offset for coord, offset in zip(corner, shift)])
    return TF.perspective(patch, corners, moved, interpolation=TF.InterpolationMode.BILINEAR)
def stretch_edge_patch(patch, pad_width, edge_key):
    """Resize a neighbour's edge strip to border size and warp it.

    The strip is resized to the full padded extent of the side it will fill
    (``pad_width`` thick, ``2 * pad_width`` longer than the strip itself) and
    then passed to ``perspective_transform_patch``, which moves the two
    corners on the inner side (the side touching the padded face) towards
    each other by ``pad_width`` along the strip's long axis.

    Args:
        patch: Edge strip tensor of shape (C, H, W).
        pad_width: Border thickness in pixels.
        edge_key: Which border the strip fills: ``"top"``, ``"bottom"``,
            ``"left"`` or ``"right"``.

    Returns:
        Tensor of shape (C, pad_width, W + 2 * pad_width) for top/bottom, or
        (C, H + 2 * pad_width, pad_width) for left/right.

    Raises:
        ValueError: If ``edge_key`` is not one of the four supported sides
            (previously this silently returned ``None``).
    """
    # Corner displacements as [dx, dy] for
    # top-left, top-right, bottom-right, bottom-left.
    corner_deltas = {
        "top": [[0, 0], [0, 0], [-pad_width, 0], [pad_width, 0]],
        "bottom": [[pad_width, 0], [-pad_width, 0], [0, 0], [0, 0]],
        "left": [[0, 0], [0, pad_width], [0, -pad_width], [0, 0]],
        "right": [[0, pad_width], [0, 0], [0, 0], [0, -pad_width]],
    }
    if edge_key not in corner_deltas:
        raise ValueError(
            f"Invalid edge key: {edge_key}. Must be one of {list(corner_deltas)}."
        )

    _, height, width = patch.shape
    if edge_key in ("top", "bottom"):
        # Horizontal strip: full padded width, pad_width rows.
        target_size = (pad_width, width + 2 * pad_width)
    else:
        # Vertical strip: full padded height, pad_width columns.
        target_size = (height + 2 * pad_width, pad_width)

    strip = TF.resize(patch, target_size)
    return perspective_transform_patch(strip, corner_deltas[edge_key])
# -------------------------------
# Per-face padding (edge-stitching) functions
# -------------------------------
def pad_front(face: Tensor, faces, pad_width):
    """Pad the front face with strips taken from its four neighbours.

    Border sources:
        top    <- last ``pad_width`` rows of ``faces['top']``
        bottom <- first ``pad_width`` rows of ``faces['bottom']``
        left   <- last ``pad_width`` columns of ``faces['left']``
        right  <- first ``pad_width`` columns of ``faces['right']``

    Args:
        face: Front face tensor of shape (C, H, W).
        faces: Dict of face tensors keyed by face name.
        pad_width: Border thickness in pixels.

    Returns:
        Tensor of shape (C, H + 2 * pad_width, W + 2 * pad_width) with
        ``face`` in the centre.
    """
    channels, height, width = face.shape
    p = pad_width
    canvas = torch.zeros(
        (channels, height + 2 * p, width + 2 * p), dtype=face.dtype, device=face.device
    )
    canvas[:, p:p + height, p:p + width] = face

    # Every strip spans the full padded side, so adjacent strips overlap in
    # the corners, where their values are summed.
    canvas[:, :p, :width + 2 * p] += stretch_edge_patch(faces['top'][:, -p:, :], p, "top")
    canvas[:, height + p:height + 2 * p, :] += stretch_edge_patch(faces['bottom'][:, :p, :], p, "bottom")
    canvas[:, :, :p] += stretch_edge_patch(faces['left'][:, :, -p:], p, "left")
    canvas[:, :, width + p:width + 2 * p] += stretch_edge_patch(faces['right'][:, :, :p], p, "right")
    return canvas
def pad_right(face, faces, pad_width):
    """Pad the right face with strips taken from its four neighbours.

    Border sources:
        left   <- last ``pad_width`` columns of ``faces['front']``
        right  <- first ``pad_width`` columns of ``faces['back']``
        top    <- last ``pad_width`` columns of ``faces['top']``,
                  rotated with ``torch.rot90(k=3)``
        bottom <- last ``pad_width`` columns of ``faces['bottom']``,
                  rotated with ``torch.rot90(k=1)``

    Args:
        face: Right face tensor of shape (C, H, W).
        faces: Dict of face tensors keyed by face name.
        pad_width: Border thickness in pixels.

    Returns:
        Tensor of shape (C, H + 2 * pad_width, W + 2 * pad_width) with
        ``face`` in the centre.
    """
    channels, height, width = face.shape
    p = pad_width
    near = slice(0, p)
    far_rows = slice(height + p, height + 2 * p)
    far_cols = slice(width + p, width + 2 * p)

    canvas = torch.zeros(
        (channels, height + 2 * p, width + 2 * p), dtype=face.dtype, device=face.device
    )
    canvas[:, p:p + height, p:p + width] = face

    canvas[:, :, near] += stretch_edge_patch(faces['front'][:, :, -p:], p, "left")
    canvas[:, :, far_cols] += stretch_edge_patch(faces['back'][:, :, :p], p, "right")

    # The column strips of top/bottom have shape (C, H, p); the quarter-turn
    # rotation makes them (C, p, H) so they can be used as row strips.
    from_top = torch.rot90(faces['top'][:, :, -p:], k=3, dims=(1, 2))
    canvas[:, near, :] += stretch_edge_patch(from_top, p, "top")
    from_bottom = torch.rot90(faces['bottom'][:, :, -p:], k=1, dims=(1, 2))
    canvas[:, far_rows, :] += stretch_edge_patch(from_bottom, p, "bottom")
    return canvas
def pad_back(face, faces, pad_width):
    """Pad the back face with strips taken from its four neighbours.

    Border sources:
        left   <- last ``pad_width`` columns of ``faces['right']``
        right  <- first ``pad_width`` columns of ``faces['left']``
        top    <- first ``pad_width`` rows of ``faces['top']``,
                  rotated with ``torch.rot90(k=2)``
        bottom <- last ``pad_width`` rows of ``faces['bottom']``,
                  rotated with ``torch.rot90(k=2)``

    Args:
        face: Back face tensor of shape (C, H, W).
        faces: Dict of face tensors keyed by face name.
        pad_width: Border thickness in pixels.

    Returns:
        Tensor of shape (C, H + 2 * pad_width, W + 2 * pad_width) with
        ``face`` in the centre.
    """
    channels, height, width = face.shape
    p = pad_width
    out_h = height + 2 * p
    out_w = width + 2 * p

    canvas = torch.zeros((channels, out_h, out_w), dtype=face.dtype, device=face.device)
    canvas[:, p:p + height, p:p + width] = face

    side_from_right = faces['right'][:, :, -p:]
    canvas[:, :, :p] += stretch_edge_patch(side_from_right, p, "left")
    side_from_left = faces['left'][:, :, :p]
    canvas[:, :, width + p:out_w] += stretch_edge_patch(side_from_left, p, "right")

    # Two quarter turns (k=2) flip the row strips of top/bottom end-for-end
    # before they are attached.
    flipped_top = torch.rot90(faces['top'][:, :p, :], k=2, dims=(1, 2))
    canvas[:, :p, :] += stretch_edge_patch(flipped_top, p, "top")
    flipped_bottom = torch.rot90(faces['bottom'][:, -p:, :], k=2, dims=(1, 2))
    canvas[:, height + p:out_h, :] += stretch_edge_patch(flipped_bottom, p, "bottom")
    return canvas
def pad_left(face, faces, pad_width):
    """Pad the left face with strips taken from its four neighbours.

    Border sources:
        left   <- last ``pad_width`` columns of ``faces['back']``
        right  <- first ``pad_width`` columns of ``faces['front']``
        top    <- first ``pad_width`` columns of ``faces['top']``,
                  rotated with ``torch.rot90(k=1)``
        bottom <- first ``pad_width`` columns of ``faces['bottom']``,
                  rotated with ``torch.rot90(k=3)``

    Args:
        face: Left face tensor of shape (C, H, W).
        faces: Dict of face tensors keyed by face name.
        pad_width: Border thickness in pixels.

    Returns:
        Tensor of shape (C, H + 2 * pad_width, W + 2 * pad_width) with
        ``face`` in the centre.
    """
    channels, height, width = face.shape
    p = pad_width
    bottom_rows = slice(height + p, height + 2 * p)
    right_cols = slice(width + p, width + 2 * p)

    canvas = torch.zeros(
        (channels, height + 2 * p, width + 2 * p), dtype=face.dtype, device=face.device
    )
    canvas[:, p:p + height, p:p + width] = face

    canvas[:, :, 0:p] += stretch_edge_patch(faces['back'][:, :, -p:], p, "left")
    canvas[:, :, right_cols] += stretch_edge_patch(faces['front'][:, :, :p], p, "right")

    # Column strips of top/bottom are rotated a quarter turn so they become
    # row strips.
    top_strip = torch.rot90(faces['top'][:, :, :p], k=1, dims=(1, 2))
    canvas[:, 0:p, :] += stretch_edge_patch(top_strip, p, "top")
    bottom_strip = torch.rot90(faces['bottom'][:, :, :p], k=3, dims=(1, 2))
    canvas[:, bottom_rows, :] += stretch_edge_patch(bottom_strip, p, "bottom")
    return canvas
def pad_top(face, faces, pad_width):
    """Pad the top face with strips taken from its four neighbours.

    Every strip is the first ``pad_width`` rows of the neighbour:
        bottom <- ``faces['front']`` (unrotated)
        left   <- ``faces['left']``, rotated with ``torch.rot90(k=3)``
        right  <- ``faces['right']``, rotated with ``torch.rot90(k=1)``
        top    <- ``faces['back']``, rotated with ``torch.rot90(k=2)``

    Args:
        face: Top face tensor of shape (C, H, W).
        faces: Dict of face tensors keyed by face name.
        pad_width: Border thickness in pixels.

    Returns:
        Tensor of shape (C, H + 2 * pad_width, W + 2 * pad_width) with
        ``face`` in the centre.
    """
    channels, height, width = face.shape
    p = pad_width

    def upper_rows(name):
        # First pad_width rows of the named neighbour.
        return faces[name][:, :p, :]

    canvas = torch.zeros(
        (channels, height + 2 * p, width + 2 * p), dtype=face.dtype, device=face.device
    )
    canvas[:, p:p + height, p:p + width] = face

    canvas[:, height + p:height + 2 * p, :] += stretch_edge_patch(upper_rows('front'), p, "bottom")
    canvas[:, :, 0:p] += stretch_edge_patch(
        torch.rot90(upper_rows('left'), k=3, dims=(1, 2)), p, "left"
    )
    canvas[:, :, width + p:width + 2 * p] += stretch_edge_patch(
        torch.rot90(upper_rows('right'), k=1, dims=(1, 2)), p, "right"
    )
    canvas[:, 0:p, :] += stretch_edge_patch(
        torch.rot90(upper_rows('back'), k=2, dims=(1, 2)), p, "top"
    )
    return canvas
def pad_bottom(face, faces, pad_width):
    """Pad the bottom face with strips taken from its four neighbours.

    Every strip is the last ``pad_width`` rows of the neighbour:
        top    <- ``faces['front']`` (unrotated)
        left   <- ``faces['left']``, rotated with ``torch.rot90(k=1)``
        right  <- ``faces['right']``, rotated with ``torch.rot90(k=3)``
        bottom <- ``faces['back']``, rotated with ``torch.rot90(k=2)``

    Args:
        face: Bottom face tensor of shape (C, H, W).
        faces: Dict of face tensors keyed by face name.
        pad_width: Border thickness in pixels.

    Returns:
        Tensor of shape (C, H + 2 * pad_width, W + 2 * pad_width) with
        ``face`` in the centre.
    """
    channels, height, width = face.shape
    p = pad_width

    def lower_rows(name):
        # Last pad_width rows of the named neighbour.
        return faces[name][:, -p:, :]

    canvas = torch.zeros(
        (channels, height + 2 * p, width + 2 * p), dtype=face.dtype, device=face.device
    )
    canvas[:, p:p + height, p:p + width] = face

    canvas[:, 0:p, :] += stretch_edge_patch(lower_rows('front'), p, "top")
    canvas[:, :, 0:p] += stretch_edge_patch(
        torch.rot90(lower_rows('left'), k=1, dims=(1, 2)), p, "left"
    )
    canvas[:, :, width + p:width + 2 * p] += stretch_edge_patch(
        torch.rot90(lower_rows('right'), k=3, dims=(1, 2)), p, "right"
    )
    canvas[:, height + p:height + 2 * p, :] += stretch_edge_patch(
        torch.rot90(lower_rows('back'), k=2, dims=(1, 2)), p, "bottom"
    )
    return canvas
# Dispatch table: face name -> function that pads that face.
pad_funcs = dict(
    front=pad_front,
    right=pad_right,
    back=pad_back,
    left=pad_left,
    top=pad_top,
    bottom=pad_bottom,
)
def pad_face(faces: dict, width: int, face_name: str) -> Tensor:
    """Pad one cube face using the padding function registered for it.

    Args:
        faces: Dict of face tensors keyed by face name.
        width: Border thickness in pixels, passed on as ``pad_width``.
        face_name: Name of the face to pad; must be a key of ``pad_funcs``.

    Returns:
        The padded face produced by the matching ``pad_*`` function.

    Raises:
        ValueError: If ``face_name`` is not a key of ``pad_funcs``.
    """
    pad_fn = pad_funcs.get(face_name)
    if pad_fn is None:
        raise ValueError(f"Invalid face name: {face_name}. Must be one of {list(pad_funcs.keys())}.")
    return pad_fn(faces[face_name], faces, width)
def prepare_mask(image, facename):
    """Build a single-channel mask matching the spatial size of ``image``.

    The mask is all zeros when ``facename`` is ``"front"`` and all ones for
    any other face name.

    NOTE(review): earlier documentation of this function described the
    opposite polarity (front = 1, others = 0); this text follows what the
    code does - confirm which polarity callers expect.

    Args:
        image: Tensor of shape (C, H, W) or (N, C, H, W).
        facename: Name of the face the image belongs to, e.g. ``"front"``.

    Returns:
        Tensor of shape (1, H, W) with the dtype and device of ``image``.

    Raises:
        ValueError: If ``image`` is neither 3- nor 4-dimensional.
    """
    if image.ndim not in (3, 4):
        raise ValueError("Unsupported image shape")

    # For both supported layouts, H and W are the last two dimensions.
    height, width = image.shape[-2], image.shape[-1]
    fill = torch.zeros if facename == "front" else torch.ones
    return fill((1, height, width), dtype=image.dtype, device=image.device)
def generate_cubemap_uv(H, W):
    """Compute a (u, v) map for every cube face.

    A regular grid over [-1, 1] x [-1, 1] is placed on each face to obtain a
    3D direction (x, y, z) per pixel, which is then converted with::

        u = atan2(x, z) / (2 * pi) + 0.5
        v = atan2(y, sqrt(x**2 + z**2)) / (2 * pi) + 0.5

    NOTE(review): the elevation term is also divided by 2 * pi, so v only
    covers [0.25, 0.75] - confirm this is the intended normalisation.

    Args:
        H: Face height in pixels (converted with ``int``).
        W: Face width in pixels (converted with ``int``).

    Returns:
        Dict mapping each face name to a tensor of shape (2, H, W) holding
        u in channel 0 and v in channel 1.
    """
    H, W = int(H), int(W)

    # Grid coordinates in [-1, 1]: `cols` varies along the width and `rows`
    # along the height; both are H x W.
    cols = torch.linspace(-1, 1, W).view(1, -1).expand(H, -1)
    rows = torch.linspace(-1, 1, H).view(-1, 1).expand(-1, W)
    plus_one = torch.ones_like(cols)
    minus_one = -plus_one

    # Per-face (x, y, z) direction of every grid point.
    directions = {
        "front": (cols, rows, plus_one),
        "back": (-cols, rows, minus_one),
        "left": (minus_one, rows, cols),
        "right": (plus_one, rows, -cols),
        "top": (cols, minus_one, rows),
        "bottom": (cols, plus_one, -rows),
    }

    two_pi = 2 * torch.pi
    uv_faces = {}
    for name, (x, y, z) in directions.items():
        horizontal = torch.sqrt(x ** 2 + z ** 2)
        u = torch.atan2(x, z) / two_pi + 0.5
        v = torch.atan2(y, horizontal) / two_pi + 0.5
        uv_faces[name] = torch.stack([u, v], dim=0)
    return uv_faces
import torch
def generate_cubemap_uv_padding(H, W, padding_pixels=0):
    """Compute per-face (u, v) maps over a face extended by a padding margin.

    The grid spans [-1 - r, 1 + r] on both axes, with
    ``r = padding_pixels / W``, sampled at the padded resolution
    (H + 2 * padding_pixels, W + 2 * padding_pixels). The resulting UV map is
    then bilinearly resized back to (H, W) with ``align_corners=True``.

    NOTE(review): the same W-based ratio is used for the vertical axis, which
    only matches the horizontal one when H == W - confirm faces are square.

    Args:
        H: Face height in pixels (converted with ``int``).
        W: Face width in pixels (converted with ``int``).
        padding_pixels: Padding margin in pixels added on every side.

    Returns:
        Dict mapping each face name to a tensor of shape (2, H, W) holding
        u in channel 0 and v in channel 1.
    """
    H, W = int(H), int(W)

    margin = padding_pixels / W  # e.g. 50 / 512 is about 0.0977
    padded_h = H + 2 * padding_pixels
    padded_w = W + 2 * padding_pixels
    low, high = -1 - margin, 1 + margin

    cols = torch.linspace(low, high, padded_w).view(1, -1).expand(padded_h, -1)
    rows = torch.linspace(low, high, padded_h).view(-1, 1).expand(-1, padded_w)
    plus_one = torch.ones_like(cols)
    minus_one = -plus_one

    # Per-face (x, y, z) direction of every grid point.
    directions = {
        "front": (cols, rows, plus_one),
        "back": (-cols, rows, minus_one),
        "left": (minus_one, rows, cols),
        "right": (plus_one, rows, -cols),
        "top": (cols, minus_one, rows),
        "bottom": (cols, plus_one, -rows),
    }

    two_pi = 2 * torch.pi
    uv_faces = {}
    for name, (x, y, z) in directions.items():
        u = torch.atan2(x, z) / two_pi + 0.5
        v = torch.atan2(y, torch.sqrt(x ** 2 + z ** 2)) / two_pi + 0.5
        padded_uv = torch.stack([u, v], dim=0).unsqueeze(0)  # (1, 2, padded_h, padded_w)
        # Bring the padded-resolution map back to the face resolution.
        uv_faces[name] = F.interpolate(
            padded_uv, size=(H, W), mode='bilinear', align_corners=True
        ).squeeze(0)
    return uv_faces
def merge_uv_with_latent(latent, uv_maps, dim=1):
    """Resize UV maps to the latent's spatial size and concatenate them.

    Args:
        latent: Tensor whose last two dimensions are the target spatial size.
        uv_maps: Tensor accepted by ``F.interpolate`` in bilinear mode
            (i.e. 4-D, batch and channel first).
        dim: Dimension along which to concatenate; defaults to 1, the
            channel dimension of an (N, C, H, W) tensor.

    Returns:
        ``latent`` followed by the resized UV maps along ``dim``.
    """
    spatial_size = latent.shape[-2:]
    # Bilinear resize with align_corners=False.
    resized_uv = F.interpolate(uv_maps, size=spatial_size, mode="bilinear", align_corners=False)
    return torch.cat((latent, resized_uv), dim=dim)
def resize_and_crop(image: np.ndarray, padding: int) -> np.ndarray:
    """Enlarge an image by ``padding`` on every side, then crop the centre.

    The image is resized to (H + 2 * padding, W + 2 * padding) with linear
    interpolation and the outer ``padding`` pixels on each side are cut off,
    so the result has the original height and width.

    Args:
        image: Input image whose first two dimensions are height and width.
        padding: Border width in pixels used for the enlarge-and-crop.

    Returns:
        The central H x W region of the enlarged image.

    Raises:
        ValueError: If ``image`` is not a ``numpy.ndarray``.
    """
    if not isinstance(image, np.ndarray):
        raise ValueError("输入图片必须是 numpy 数组格式")

    height, width = image.shape[:2]

    # cv2.resize takes the target size as (width, height).
    enlarged_size = (width + 2 * padding, height + 2 * padding)
    enlarged = cv2.resize(image, enlarged_size, interpolation=cv2.INTER_LINEAR)

    rows = slice(padding, padding + height)
    cols = slice(padding, padding + width)
    return enlarged[rows, cols]
def cubemap_unfold(cubemaps, H: int = 512, W: int = 512, channels: int = 3, transparent: bool = False) -> Image.Image:
    """Lay six cube faces out as a 3x4 cross and return it as a PIL image.

    Layout (3 rows x 4 columns of H x W cells)::

        .     top     .      .
        left  front   right  back
        .     bottom  .      .

    Args:
        cubemaps: Indexable of six face arrays ordered like ``FACES``. The
            canvas dtype is taken from ``cubemaps[0]``.
        H: Height in pixels of one face cell.
        W: Width in pixels of one face cell.
        channels: Number of colour channels per face. With ``channels == 1``
            a face may be shaped (H, W) or (H, W, 1).
        transparent: If true, append an alpha channel that is 255 under the
            faces and 0 in the empty cells.

    Returns:
        PIL image of size (4 * W, 3 * H). Floating-point data is multiplied
        by 255, clipped to [0, 255] and cast to ``uint8`` before the image is
        built. A single-channel canvas without alpha is returned as a 2-D
        (grayscale) image.
    """
    # (row, col) cell of every face in the cross layout.
    layout = {
        "top": (0, 1),
        "left": (1, 0),
        "front": (1, 1),
        "right": (1, 2),
        "back": (1, 3),
        "bottom": (2, 1),
    }

    num_channels = channels + 1 if transparent else channels
    alpha_layer = num_channels - 1
    canvas = np.zeros((3 * H, 4 * W, num_channels), dtype=cubemaps[0].dtype)
    face_imgs = {face: cubemaps[i] for i, face in enumerate(FACES)}

    def cell(row, col):
        # Row/column slices of one H x W cell of the canvas.
        return slice(row * H, (row + 1) * H), slice(col * W, (col + 1) * W)

    for face, (row, col) in layout.items():
        rows, cols = cell(row, col)
        tile = np.asarray(face_imgs[face])
        if channels == 1 and tile.ndim == 2:
            # Give 2-D grayscale faces an explicit channel axis so they can
            # be written into the 3-D canvas.
            tile = tile[..., np.newaxis]
        canvas[rows, cols, :channels] = tile

    # Convert colour data first so the alpha value written below is not
    # affected by the 0..1 -> 0..255 rescale.
    if np.issubdtype(canvas.dtype, np.floating):
        canvas = np.clip(canvas * 255, 0, 255).astype(np.uint8)

    if transparent:
        for row, col in layout.values():
            rows, cols = cell(row, col)
            canvas[rows, cols, alpha_layer] = 255  # opaque under each face

    if channels == 1 and not transparent:
        # Drop the channel axis so PIL receives a plain 2-D grayscale array.
        canvas = canvas[..., 0]
    return Image.fromarray(canvas)