EvanEternal committed on
Commit
08bf07d
·
verified ·
1 Parent(s): 9ecdc6d

Upload 86 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. icons/move_backward.png +0 -0
  2. icons/move_forward.png +0 -0
  3. icons/move_left.png +0 -0
  4. icons/move_right.png +0 -0
  5. icons/not_move_backward.png +0 -0
  6. icons/not_move_forward.png +0 -0
  7. icons/not_move_left.png +0 -0
  8. icons/not_move_right.png +0 -0
  9. icons/not_turn_down.png +0 -0
  10. icons/not_turn_left.png +0 -0
  11. icons/not_turn_right.png +0 -0
  12. icons/not_turn_up.png +0 -0
  13. icons/turn_down.png +0 -0
  14. icons/turn_left.png +0 -0
  15. icons/turn_right.png +0 -0
  16. icons/turn_up.png +0 -0
  17. models/Astra/checkpoints/Put ReCamMaster ckpt file here.txt +0 -0
  18. models/Astra/checkpoints/README.md +5 -0
  19. scripts/add_text_emb.py +161 -0
  20. scripts/add_text_emb_rl.py +161 -0
  21. scripts/add_text_emb_spatialvid.py +173 -0
  22. scripts/analyze_openx.py +243 -0
  23. scripts/analyze_pose.py +188 -0
  24. scripts/batch_drone.py +44 -0
  25. scripts/batch_infer.py +186 -0
  26. scripts/batch_nus.py +42 -0
  27. scripts/batch_rt.py +41 -0
  28. scripts/batch_spa.py +43 -0
  29. scripts/batch_walk.py +42 -0
  30. scripts/check.py +263 -0
  31. scripts/decode_openx.py +428 -0
  32. scripts/download_recam.py +7 -0
  33. scripts/download_wan2.1.py +5 -0
  34. scripts/encode_dynamic_videos.py +141 -0
  35. scripts/encode_openx.py +466 -0
  36. scripts/encode_rlbench_video.py +170 -0
  37. scripts/encode_sekai_video.py +162 -0
  38. scripts/encode_sekai_walking.py +249 -0
  39. scripts/encode_spatialvid.py +409 -0
  40. scripts/encode_spatialvid_first_frame.py +285 -0
  41. scripts/hud_logo.py +40 -0
  42. scripts/infer_demo.py +1458 -0
  43. scripts/infer_moe.py +1023 -0
  44. scripts/infer_moe_spatialvid.py +1008 -0
  45. scripts/infer_moe_test.py +976 -0
  46. scripts/infer_nus.py +500 -0
  47. scripts/infer_openx.py +614 -0
  48. scripts/infer_origin.py +1108 -0
  49. scripts/infer_recam.py +272 -0
  50. scripts/infer_rlbench.py +447 -0
icons/move_backward.png ADDED
icons/move_forward.png ADDED
icons/move_left.png ADDED
icons/move_right.png ADDED
icons/not_move_backward.png ADDED
icons/not_move_forward.png ADDED
icons/not_move_left.png ADDED
icons/not_move_right.png ADDED
icons/not_turn_down.png ADDED
icons/not_turn_left.png ADDED
icons/not_turn_right.png ADDED
icons/not_turn_up.png ADDED
icons/turn_down.png ADDED
icons/turn_left.png ADDED
icons/turn_right.png ADDED
icons/turn_up.png ADDED
models/Astra/checkpoints/Put ReCamMaster ckpt file here.txt ADDED
File without changes
models/Astra/checkpoints/README.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+ # ReCamMaster: Camera-Controlled Generative Rendering from A Single Video
5
+ Please refer to the [Github](https://github.com/KwaiVGI/ReCamMaster) README for usage.
scripts/add_text_emb.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import lightning as pl
4
+ from PIL import Image
5
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
6
+ import json
7
+ import imageio
8
+ from torchvision.transforms import v2
9
+ from einops import rearrange
10
+ import argparse
11
+ import numpy as np
12
+ import pdb
13
+ from tqdm import tqdm
14
+
15
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
16
+
17
class VideoEncoder(pl.LightningModule):
    """Thin wrapper bundling the WanVideo ReCamMaster pipeline with frame preprocessing.

    Loads the T5 text encoder and the Wan VAE on CPU in bfloat16 and exposes
    helpers to decode a video file into a normalized (C, T, H, W) tensor.
    """

    def __init__(self, text_encoder_path, vae_path, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
        super().__init__()
        manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
        manager.load_models([text_encoder_path, vae_path])
        self.pipe = WanVideoReCamMasterPipeline.from_model_manager(manager)
        # Tiling options forwarded to the VAE when encoding videos.
        self.tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}

        # PIL frame -> float tensor in [0, 1] -> normalized to [-1, 1].
        self.frame_process = v2.Compose([
            v2.ToTensor(),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])

    def crop_and_resize(self, image):
        """Resize *image* to the fixed 832x480 model resolution (bilinear)."""
        target_width, target_height = 832, 480
        return v2.functional.resize(
            image,
            (target_height, target_width),
            interpolation=v2.InterpolationMode.BILINEAR,
        )

    def load_video_frames(self, video_path):
        """Decode *video_path* into a (C, T, H, W) tensor, or None for an empty video."""
        reader = imageio.get_reader(video_path)
        processed = [
            self.frame_process(self.crop_and_resize(Image.fromarray(raw)))
            for raw in reader
        ]
        reader.close()

        if not processed:
            return None

        stacked = torch.stack(processed, dim=0)  # (T, C, H, W)
        return rearrange(stacked, "T C H W -> C T H W")
62
+
63
def encode_scenes(scenes_path, text_encoder_path, vae_path, output_dir):
    """Add a shared text-prompt embedding to every encoded scene.

    For each entry under *scenes_path*, loads ``<output_dir>/<stem>/encoded_video.pth``
    and, when the ``prompt_emb`` key (or any required key) is missing, encodes a
    fixed pedestrian-walking prompt once and writes it back into the file in place.

    Args:
        scenes_path: Directory whose entries name the scenes to process.
        text_encoder_path: Path to the T5 text-encoder checkpoint.
        vae_path: Path to the Wan VAE checkpoint.
        output_dir: Root directory holding the per-scene encoded ``.pth`` files.
    """
    encoder = VideoEncoder(text_encoder_path, vae_path)
    encoder = encoder.cuda()
    encoder.pipe.device = "cuda"

    processed_count = 0
    prompt_emb = None  # encoded lazily, once, on the first file that needs it

    os.makedirs(output_dir, exist_ok=True)
    required_keys = ["latents", "cam_emb", "prompt_emb"]

    # List the directory once instead of twice (originally listed for tqdm's total too).
    scene_names = os.listdir(scenes_path)
    for scene_name in tqdm(scene_names, total=len(scene_names)):
        # Scene files like "foo.mp4" map to the encoded directory "foo".
        save_dir = os.path.join(output_dir, scene_name.split('.')[0])
        encoded_path = os.path.join(save_dir, "encoded_video.pth")
        print(f"Checking scene {scene_name}...")

        # Fix: the existence check was commented out, so un-encoded entries
        # crashed torch.load with FileNotFoundError. Skip them instead.
        if not os.path.exists(encoded_path):
            print(f"Missing encoded file, skipping: {encoded_path}")
            continue

        # map_location="cpu" keeps the cached latents off the GPU while editing
        # the dict (consistent with the spatialvid variant of this script).
        data = torch.load(encoded_path, weights_only=False, map_location="cpu")
        missing_keys = [key for key in required_keys if key not in data]

        if missing_keys:
            print(f"警告: 文件中缺少以下必要元素: {missing_keys}")
        else:
            print("文件包含所有必要元素: latents 和 cam_emb 和 prompt_emb")
            continue

        with torch.no_grad():
            if prompt_emb is None:
                # Encode the shared prompt once, then drop the prompter to free memory.
                print('encode prompt!!!')
                prompt_emb = encoder.pipe.encode_prompt(
                    "A video of a scene shot using a pedestrian's front camera while walking"
                )
                del encoder.pipe.prompter

            data["prompt_emb"] = prompt_emb
            print("已添加/更新 prompt_emb 元素")
            # Overwrites the original file in place.
            torch.save(data, encoded_path)

        print(f"Saved encoded data: {encoded_path}")
        processed_count += 1

    print(processed_count)
    print(f"Encoding completed! Processed {processed_count} scenes.")
148
+
149
if __name__ == "__main__":
    # Script entry point: all paths default to the sekai-game-walking layout.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--scenes_path", type=str,
                            default="/share_zhuyixuan05/zhuyixuan05/sekai-game-walking")
    arg_parser.add_argument("--text_encoder_path", type=str,
                            default="models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth")
    arg_parser.add_argument("--vae_path", type=str,
                            default="models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth")
    arg_parser.add_argument("--output_dir", type=str,
                            default="/share_zhuyixuan05/zhuyixuan05/sekai-game-walking")
    cli_args = arg_parser.parse_args()
    encode_scenes(cli_args.scenes_path, cli_args.text_encoder_path,
                  cli_args.vae_path, cli_args.output_dir)
scripts/add_text_emb_rl.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import lightning as pl
4
+ from PIL import Image
5
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
6
+ import json
7
+ import imageio
8
+ from torchvision.transforms import v2
9
+ from einops import rearrange
10
+ import argparse
11
+ import numpy as np
12
+ import pdb
13
+ from tqdm import tqdm
14
+
15
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
16
+
17
class VideoEncoder(pl.LightningModule):
    """Thin wrapper bundling the WanVideo ReCamMaster pipeline with frame preprocessing.

    Loads the T5 text encoder and the Wan VAE on CPU in bfloat16 and exposes
    helpers to decode a video file into a normalized (C, T, H, W) tensor.
    """

    def __init__(self, text_encoder_path, vae_path, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
        super().__init__()
        manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
        manager.load_models([text_encoder_path, vae_path])
        self.pipe = WanVideoReCamMasterPipeline.from_model_manager(manager)
        # Tiling options forwarded to the VAE when encoding videos.
        self.tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}

        # PIL frame -> float tensor in [0, 1] -> normalized to [-1, 1].
        self.frame_process = v2.Compose([
            v2.ToTensor(),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])

    def crop_and_resize(self, image):
        """Resize *image* to the fixed 832x480 model resolution (bilinear)."""
        target_width, target_height = 832, 480
        return v2.functional.resize(
            image,
            (target_height, target_width),
            interpolation=v2.InterpolationMode.BILINEAR,
        )

    def load_video_frames(self, video_path):
        """Decode *video_path* into a (C, T, H, W) tensor, or None for an empty video."""
        reader = imageio.get_reader(video_path)
        processed = [
            self.frame_process(self.crop_and_resize(Image.fromarray(raw)))
            for raw in reader
        ]
        reader.close()

        if not processed:
            return None

        stacked = torch.stack(processed, dim=0)  # (T, C, H, W)
        return rearrange(stacked, "T C H W -> C T H W")
62
+
63
def encode_scenes(scenes_path, text_encoder_path, vae_path, output_dir):
    """Add a shared text-prompt embedding to every encoded RLBench scene.

    For each entry under *scenes_path*, loads ``<output_dir>/<stem>/encoded_video.pth``
    and, when the ``prompt_emb`` key (or any required key) is missing, encodes a
    fixed robotic-arm prompt once and writes it back into the file in place.

    Args:
        scenes_path: Directory whose entries name the scenes to process.
        text_encoder_path: Path to the T5 text-encoder checkpoint.
        vae_path: Path to the Wan VAE checkpoint.
        output_dir: Root directory holding the per-scene encoded ``.pth`` files.
    """
    encoder = VideoEncoder(text_encoder_path, vae_path)
    encoder = encoder.cuda()
    encoder.pipe.device = "cuda"

    processed_count = 0
    prompt_emb = None  # encoded lazily, once, on the first file that needs it

    os.makedirs(output_dir, exist_ok=True)
    required_keys = ["latents", "cam_emb", "prompt_emb"]

    # List the directory once instead of twice (originally listed for tqdm's total too).
    scene_names = os.listdir(scenes_path)
    for scene_name in tqdm(scene_names, total=len(scene_names)):
        # Scene files like "foo.mp4" map to the encoded directory "foo".
        save_dir = os.path.join(output_dir, scene_name.split('.')[0])
        encoded_path = os.path.join(save_dir, "encoded_video.pth")
        print(f"Checking scene {scene_name}...")

        # Fix: the existence check was commented out, so un-encoded entries
        # crashed torch.load with FileNotFoundError. Skip them instead.
        if not os.path.exists(encoded_path):
            print(f"Missing encoded file, skipping: {encoded_path}")
            continue

        # map_location="cpu" keeps the cached latents off the GPU while editing
        # the dict (consistent with the spatialvid variant of this script).
        data = torch.load(encoded_path, weights_only=False, map_location="cpu")
        missing_keys = [key for key in required_keys if key not in data]

        if missing_keys:
            print(f"警告: 文件中缺少以下必要元素: {missing_keys}")
        else:
            print("文件包含所有必要元素: latents 和 cam_emb 和 prompt_emb")
            continue

        with torch.no_grad():
            if prompt_emb is None:
                # Encode the shared prompt once, then drop the prompter to free memory.
                print('encode prompt!!!')
                prompt_emb = encoder.pipe.encode_prompt(
                    "a robotic arm executing precise manipulation tasks on a clean, organized desk"
                )
                del encoder.pipe.prompter

            data["prompt_emb"] = prompt_emb
            print("已添加/更新 prompt_emb 元素")
            # Overwrites the original file in place.
            torch.save(data, encoded_path)

        print(f"Saved encoded data: {encoded_path}")
        processed_count += 1

    print(processed_count)
    print(f"Encoding completed! Processed {processed_count} scenes.")
148
+
149
if __name__ == "__main__":
    # Script entry point: all paths default to the rlbench layout.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--scenes_path", type=str,
                            default="/share_zhuyixuan05/zhuyixuan05/rlbench")
    arg_parser.add_argument("--text_encoder_path", type=str,
                            default="models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth")
    arg_parser.add_argument("--vae_path", type=str,
                            default="models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth")
    arg_parser.add_argument("--output_dir", type=str,
                            default="/share_zhuyixuan05/zhuyixuan05/rlbench")
    cli_args = arg_parser.parse_args()
    encode_scenes(cli_args.scenes_path, cli_args.text_encoder_path,
                  cli_args.vae_path, cli_args.output_dir)
scripts/add_text_emb_spatialvid.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import lightning as pl
4
+ from PIL import Image
5
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
6
+ import json
7
+ import imageio
8
+ from torchvision.transforms import v2
9
+ from einops import rearrange
10
+ import argparse
11
+ import numpy as np
12
+ import pdb
13
+ from tqdm import tqdm
14
+
15
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
16
+
17
class VideoEncoder(pl.LightningModule):
    """Thin wrapper bundling the WanVideo ReCamMaster pipeline with frame preprocessing.

    Loads the T5 text encoder and the Wan VAE on CPU in bfloat16 and exposes
    helpers to decode a video file into a normalized (C, T, H, W) tensor.
    """

    def __init__(self, text_encoder_path, vae_path, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
        super().__init__()
        manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
        manager.load_models([text_encoder_path, vae_path])
        self.pipe = WanVideoReCamMasterPipeline.from_model_manager(manager)
        # Tiling options forwarded to the VAE when encoding videos.
        self.tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}

        # PIL frame -> float tensor in [0, 1] -> normalized to [-1, 1].
        self.frame_process = v2.Compose([
            v2.ToTensor(),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])

    def crop_and_resize(self, image):
        """Resize *image* to the fixed 832x480 model resolution (bilinear)."""
        target_width, target_height = 832, 480
        return v2.functional.resize(
            image,
            (target_height, target_width),
            interpolation=v2.InterpolationMode.BILINEAR,
        )

    def load_video_frames(self, video_path):
        """Decode *video_path* into a (C, T, H, W) tensor, or None for an empty video."""
        reader = imageio.get_reader(video_path)
        processed = [
            self.frame_process(self.crop_and_resize(Image.fromarray(raw)))
            for raw in reader
        ]
        reader.close()

        if not processed:
            return None

        stacked = torch.stack(processed, dim=0)  # (T, C, H, W)
        return rearrange(stacked, "T C H W -> C T H W")
62
+
63
def encode_scenes(scenes_path, text_encoder_path, vae_path, output_dir):
    """Scrub gradient tracking from cached prompt embeddings of every scene.

    Loads each ``<output_dir>/<stem>/encoded_video.pth``, reports missing
    required keys, and when ``prompt_emb['context']`` still carries
    ``requires_grad=True`` detaches it and rewrites the file in place.

    Args:
        scenes_path: Directory whose entries name the scenes to process.
        text_encoder_path: Path to the T5 text-encoder checkpoint.
        vae_path: Path to the Wan VAE checkpoint.
        output_dir: Root directory holding the per-scene encoded ``.pth`` files.
    """
    # NOTE(review): the encoder itself is no longer used below (the encode path
    # was disabled upstream); kept to preserve the original loading side effects.
    encoder = VideoEncoder(text_encoder_path, vae_path)
    encoder = encoder.cuda()
    encoder.pipe.device = "cuda"

    processed_count = 0
    os.makedirs(output_dir, exist_ok=True)
    required_keys = ["latents", "cam_emb", "prompt_emb"]

    # List the directory once instead of twice (originally listed for tqdm's total too).
    scene_names = os.listdir(scenes_path)
    for scene_name in tqdm(scene_names, total=len(scene_names)):
        # Scene files like "foo.mp4" map to the encoded directory "foo".
        save_dir = os.path.join(output_dir, scene_name.split('.')[0])
        encoded_path = os.path.join(save_dir, "encoded_video.pth")

        # Fix: skip entries that were never encoded instead of crashing torch.load.
        if not os.path.exists(encoded_path):
            print(f"Missing encoded file, skipping: {encoded_path}")
            continue

        data = torch.load(encoded_path, weights_only=False, map_location="cpu")
        missing_keys = [key for key in required_keys if key not in data]
        if missing_keys:
            print(f"警告: 文件 {encoded_path} 中缺少以下必要元素: {missing_keys}")

        # Fix: the original indexed data['prompt_emb'] unconditionally and raised
        # KeyError right after warning that the key might be missing.
        if "prompt_emb" in data and data['prompt_emb']['context'].requires_grad:
            print(f"警告: 文件 {encoded_path} 中存在含梯度变量,已消除")
            data['prompt_emb']['context'] = data['prompt_emb']['context'].detach().clone()
            # Belt and braces: detach() already clears the flag.
            data['prompt_emb']['context'].requires_grad_(False)
            assert not data['prompt_emb']['context'].requires_grad, "梯度仍未消除!"
            torch.save(data, encoded_path)

        processed_count += 1

    print(processed_count)
    print(f"Encoding completed! Processed {processed_count} scenes.")
160
+
161
if __name__ == "__main__":
    # Script entry point: all paths default to the spatialvid layout.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--scenes_path", type=str,
                            default="/share_zhuyixuan05/zhuyixuan05/spatialvid")
    arg_parser.add_argument("--text_encoder_path", type=str,
                            default="models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth")
    arg_parser.add_argument("--vae_path", type=str,
                            default="models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth")
    arg_parser.add_argument("--output_dir", type=str,
                            default="/share_zhuyixuan05/zhuyixuan05/spatialvid")
    cli_args = arg_parser.parse_args()
    encode_scenes(cli_args.scenes_path, cli_args.text_encoder_path,
                  cli_args.vae_path, cli_args.output_dir)
scripts/analyze_openx.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from tqdm import tqdm
4
+
5
def analyze_openx_dataset_frame_counts(dataset_path):
    """Analyse the per-episode frame-count distribution of an encoded OpenX dataset.

    Scans *dataset_path* for episode directories containing ``encoded_video.pth``,
    reads the temporal (T) dimension of each latent tensor, prints summary
    statistics, writes a report to ``frame_count_analysis.txt`` inside the
    dataset directory, and returns the key counters as a dict (None on failure).
    """
    import datetime  # replaces the original __import__('datetime') hack

    print(f"🔧 分析OpenX数据集: {dataset_path}")

    if not os.path.exists(dataset_path):
        print(f" ⚠️ 路径不存在: {dataset_path}")
        return

    episode_dirs = []
    total_episodes = 0
    valid_episodes = 0

    # Collect every episode directory that has already been encoded.
    for item in os.listdir(dataset_path):
        episode_dir = os.path.join(dataset_path, item)
        if os.path.isdir(episode_dir):
            total_episodes += 1
            encoded_path = os.path.join(episode_dir, "encoded_video.pth")
            if os.path.exists(encoded_path):
                episode_dirs.append(episode_dir)
                valid_episodes += 1

    print(f"📊 总episode数: {total_episodes}")
    print(f"📊 有效episode数: {valid_episodes}")

    if len(episode_dirs) == 0:
        print("❌ 没有找到有效的episode")
        return

    frame_counts = []
    less_than_10 = 0
    less_than_8 = 0
    less_than_5 = 0
    error_count = 0

    print("🔧 开始分析帧数分布...")

    for episode_dir in tqdm(episode_dirs, desc="分析episodes"):
        try:
            encoded_data = torch.load(
                os.path.join(episode_dir, "encoded_video.pth"),
                weights_only=False,
                map_location="cpu"
            )

            latents = encoded_data['latents']  # [C, T, H, W]
            frame_count = latents.shape[1]     # T dimension
            frame_counts.append(frame_count)

            if frame_count < 10:
                less_than_10 += 1
            if frame_count < 8:
                less_than_8 += 1
            if frame_count < 5:
                less_than_5 += 1

        except Exception as e:
            error_count += 1
            if error_count <= 5:  # only report the first five failures
                print(f"❌ 加载episode {os.path.basename(episode_dir)} 时出错: {e}")

    total_valid = len(frame_counts)

    # Fix: the original divided by total_valid unconditionally and crashed with
    # ZeroDivisionError when every episode failed to load.
    if total_valid == 0:
        print("❌ 所有episode均加载失败,无法统计")
        return

    print(f"\n📈 帧数分布统计:")
    print(f" 总有效episodes: {total_valid}")
    print(f" 错误episodes: {error_count}")
    print(f" 最小帧数: {min(frame_counts)}")
    print(f" 最大帧数: {max(frame_counts)}")
    # Fix: the original wrote `print(f"...") if frame_counts else 0`, which
    # printed a bare 0 for the empty case; the guard above makes this safe.
    print(f" 平均帧数: {sum(frame_counts) / len(frame_counts):.2f}")

    print(f"\n🎯 关键统计:")
    print(f" 帧数 < 5: {less_than_5:6d} episodes ({less_than_5/total_valid*100:.2f}%)")
    print(f" 帧数 < 8: {less_than_8:6d} episodes ({less_than_8/total_valid*100:.2f}%)")
    print(f" 帧数 < 10: {less_than_10:6d} episodes ({less_than_10/total_valid*100:.2f}%)")
    print(f" 帧数 >= 10: {total_valid-less_than_10:6d} episodes ({(total_valid-less_than_10)/total_valid*100:.2f}%)")

    # Detailed distribution over fixed frame-count buckets.
    frame_counts.sort()
    print(f"\n📊 详细帧数分布:")

    ranges = [
        (1, 4, "1-4帧"),
        (5, 7, "5-7帧"),
        (8, 9, "8-9帧"),
        (10, 19, "10-19帧"),
        (20, 49, "20-49帧"),
        (50, 99, "50-99帧"),
        (100, float('inf'), "100+帧")
    ]

    for min_f, max_f, label in ranges:
        count = sum(1 for f in frame_counts if min_f <= f <= max_f)
        percentage = count / total_valid * 100
        print(f" {label:8s}: {count:6d} episodes ({percentage:5.2f}%)")

    # Suggested training configuration, assuming a 4x temporal compression.
    print(f"\n💡 训练配置建议:")
    time_compression_ratio = 4
    min_condition_compressed = 4 // time_compression_ratio    # 1 compressed conditioning frame
    target_frames_compressed = 32 // time_compression_ratio   # 8 compressed target frames
    min_required_compressed = min_condition_compressed + target_frames_compressed  # 9

    usable_episodes = sum(1 for f in frame_counts if f >= min_required_compressed)
    usable_percentage = usable_episodes / total_valid * 100

    print(f" 最小条件帧数(压缩后): {min_condition_compressed}")
    print(f" 目标帧数(压缩后): {target_frames_compressed}")
    print(f" 最小所需帧数(压缩后): {min_required_compressed}")
    print(f" 可用于训练的episodes: {usable_episodes} ({usable_percentage:.2f}%)")

    # Persist the full report next to the dataset.
    output_file = os.path.join(dataset_path, "frame_count_analysis.txt")
    with open(output_file, 'w') as f:
        f.write(f"OpenX Dataset Frame Count Analysis\n")
        f.write(f"Dataset Path: {dataset_path}\n")
        f.write(f"Analysis Date: {datetime.datetime.now()}\n\n")

        f.write(f"Total Episodes: {total_episodes}\n")
        f.write(f"Valid Episodes: {total_valid}\n")
        f.write(f"Error Episodes: {error_count}\n\n")

        f.write(f"Frame Count Statistics:\n")
        f.write(f"  Min Frames: {min(frame_counts)}\n")
        f.write(f"  Max Frames: {max(frame_counts)}\n")
        f.write(f"  Avg Frames: {sum(frame_counts) / len(frame_counts):.2f}\n\n")

        f.write(f"Key Statistics:\n")
        f.write(f"  < 5 frames: {less_than_5} ({less_than_5/total_valid*100:.2f}%)\n")
        f.write(f"  < 8 frames: {less_than_8} ({less_than_8/total_valid*100:.2f}%)\n")
        f.write(f"  < 10 frames: {less_than_10} ({less_than_10/total_valid*100:.2f}%)\n")
        f.write(f"  >= 10 frames: {total_valid-less_than_10} ({(total_valid-less_than_10)/total_valid*100:.2f}%)\n\n")

        f.write(f"Detailed Distribution:\n")
        for min_f, max_f, label in ranges:
            count = sum(1 for f in frame_counts if min_f <= f <= max_f)
            percentage = count / total_valid * 100
            f.write(f"  {label}: {count} ({percentage:.2f}%)\n")

        f.write(f"\nTraining Configuration Recommendation:\n")
        f.write(f"  Usable Episodes (>= {min_required_compressed} compressed frames): {usable_episodes} ({usable_percentage:.2f}%)\n")

        # Dump every frame count, 20 values per line.
        f.write(f"\nAll Frame Counts:\n")
        for i, count in enumerate(frame_counts):
            f.write(f"{count}")
            if (i + 1) % 20 == 0:
                f.write("\n")
            else:
                f.write(", ")

    print(f"\n💾 详细统计已保存到: {output_file}")

    return {
        'total_valid': total_valid,
        'less_than_10': less_than_10,
        'less_than_8': less_than_8,
        'less_than_5': less_than_5,
        'frame_counts': frame_counts,
        'usable_episodes': usable_episodes
    }
168
+
169
def quick_sample_analysis(dataset_path, sample_size=1000):
    """Estimate the frame-count distribution from a random sample of episodes.

    Randomly samples up to *sample_size* encoded episodes under *dataset_path*,
    reads each latent tensor's temporal dimension, prints sample statistics and
    an extrapolated estimate for the whole dataset.
    """
    import random

    print(f"🚀 快速采样分析 (样本数: {sample_size})")

    episode_dirs = []
    for item in os.listdir(dataset_path):
        episode_dir = os.path.join(dataset_path, item)
        if os.path.isdir(episode_dir):
            encoded_path = os.path.join(episode_dir, "encoded_video.pth")
            if os.path.exists(encoded_path):
                episode_dirs.append(episode_dir)

    if len(episode_dirs) == 0:
        print("❌ 没有找到有效的episode")
        return

    # Random sample without replacement, capped at the population size.
    sample_dirs = random.sample(episode_dirs, min(sample_size, len(episode_dirs)))

    frame_counts = []
    less_than_10 = 0
    failed = 0

    for episode_dir in tqdm(sample_dirs, desc="采样分析"):
        try:
            encoded_data = torch.load(
                os.path.join(episode_dir, "encoded_video.pth"),
                weights_only=False,
                map_location="cpu"
            )

            frame_count = encoded_data['latents'].shape[1]
            frame_counts.append(frame_count)

            if frame_count < 10:
                less_than_10 += 1

        except Exception:
            # Best-effort sampling: count failures instead of silently dropping them.
            failed += 1
            continue

    total_sample = len(frame_counts)

    # Fix: the original divided by total_sample unconditionally and crashed with
    # ZeroDivisionError when every sampled episode failed to load.
    if total_sample == 0:
        print(f"❌ 采样的episode均加载失败 (失败数: {failed})")
        return

    percentage_less_than_10 = less_than_10 / total_sample * 100

    print(f"📊 采样结果:")
    print(f" 采样数量: {total_sample}")
    print(f" < 10帧: {less_than_10} ({percentage_less_than_10:.2f}%)")
    print(f" >= 10帧: {total_sample - less_than_10} ({100 - percentage_less_than_10:.2f}%)")
    print(f" 平均帧数: {sum(frame_counts) / len(frame_counts):.2f}")

    # Extrapolate the sampled ratio to the full dataset.
    total_episodes = len(episode_dirs)
    estimated_less_than_10 = int(total_episodes * percentage_less_than_10 / 100)

    print(f"\n🔮 全数据集估算:")
    print(f" 总episodes: {total_episodes}")
    print(f" 估算 < 10帧: {estimated_less_than_10} ({percentage_less_than_10:.2f}%)")
    print(f" 估算 >= 10帧: {total_episodes - estimated_less_than_10} ({100 - percentage_less_than_10:.2f}%)")
227
+
228
if __name__ == "__main__":
    import argparse

    # Command-line entry point; --quick switches to the sampled estimate.
    cli = argparse.ArgumentParser(description="分析OpenX数据集的帧数分布")
    cli.add_argument("--dataset_path", type=str,
                     default="/share_zhuyixuan05/zhuyixuan05/openx-fractal-encoded",
                     help="OpenX编码数据集路径")
    cli.add_argument("--quick", action="store_true", help="快速采样分析模式")
    cli.add_argument("--sample_size", type=int, default=1000, help="快速模式的采样数量")
    opts = cli.parse_args()

    if opts.quick:
        quick_sample_analysis(opts.dataset_path, opts.sample_size)
    else:
        analyze_openx_dataset_frame_counts(opts.dataset_path)
scripts/analyze_pose.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ from pose_classifier import PoseClassifier
6
+ import torch
7
+ from collections import defaultdict
8
+
9
+ def analyze_turning_patterns_detailed(dataset_path, num_samples=50):
10
+ """详细分析转弯模式,基于相对于reference的pose变化"""
11
+ classifier = PoseClassifier()
12
+ samples_path = os.path.join(dataset_path, "samples")
13
+
14
+ all_analyses = []
15
+ sample_count = 0
16
+
17
+ # 用于统计每个类别的样本
18
+ class_samples = defaultdict(list)
19
+
20
+ print("=== 开始分析样本(基于相对于reference的变化)===")
21
+
22
+ for item in sorted(os.listdir(samples_path)): # 排序以便有序输出
23
+ if sample_count >= num_samples:
24
+ break
25
+
26
+ sample_dir = os.path.join(samples_path, item)
27
+ if os.path.isdir(sample_dir):
28
+ poses_path = os.path.join(sample_dir, "poses.json")
29
+ if os.path.exists(poses_path):
30
+ try:
31
+ with open(poses_path, 'r') as f:
32
+ poses_data = json.load(f)
33
+
34
+ target_relative_poses = poses_data['target_relative_poses']
35
+
36
+ if len(target_relative_poses) > 0:
37
+ # 🔧 创建相对pose向量(已经是相对于reference的)
38
+ pose_vecs = []
39
+ for pose_data in target_relative_poses:
40
+ # 相对位移(已经是相对于reference计算的)
41
+ translation = torch.tensor(pose_data['relative_translation'], dtype=torch.float32)
42
+
43
+ # 🔧 相对旋转(需要从current和reference计算)
44
+ current_rotation = torch.tensor(pose_data['current_rotation'], dtype=torch.float32)
45
+ reference_rotation = torch.tensor(pose_data['reference_rotation'], dtype=torch.float32)
46
+
47
+ # 计算相对旋转:q_relative = q_ref^-1 * q_current
48
+ relative_rotation = calculate_relative_rotation(current_rotation, reference_rotation)
49
+
50
+ # 组合为7D向量:[relative_translation, relative_rotation]
51
+ pose_vec = torch.cat([translation, relative_rotation], dim=0)
52
+ pose_vecs.append(pose_vec)
53
+
54
+ if pose_vecs:
55
+ pose_sequence = torch.stack(pose_vecs, dim=0)
56
+
57
+ # 🔧 使用新的分析方法
58
+ analysis = classifier.analyze_pose_sequence(pose_sequence)
59
+ analysis['sample_name'] = item
60
+ all_analyses.append(analysis)
61
+
62
+ # 🔧 详细输出每个样本的分类信息
63
+ print(f"\n--- 样本 {sample_count + 1}: {item} ---")
64
+ print(f"总帧数: {analysis['total_frames']}")
65
+ print(f"总距离: {analysis['total_distance']:.4f}")
66
+
67
+ # 分类分布
68
+ class_dist = analysis['class_distribution']
69
+ print(f"分类分布:")
70
+ for class_name, count in class_dist.items():
71
+ percentage = count / analysis['total_frames'] * 100
72
+ print(f" {class_name}: {count} 帧 ({percentage:.1f}%)")
73
+
74
+ # 🔧 调试前几个pose的分类过程
75
+ print(f"前3帧的详细分类过程:")
76
+ for i in range(min(3, len(pose_vecs))):
77
+ debug_info = classifier.debug_single_pose(
78
+ pose_vecs[i][:3], pose_vecs[i][3:7]
79
+ )
80
+ print(f" 帧{i}: {debug_info['classification']} "
81
+ f"(yaw: {debug_info['yaw_angle_deg']:.2f}°, "
82
+ f"forward: {debug_info['forward_movement']:.3f})")
83
+
84
+ # 运动段落
85
+ print(f"运动段落:")
86
+ for i, segment in enumerate(analysis['motion_segments']):
87
+ print(f" 段落{i+1}: {segment['class']} (帧 {segment['start_frame']}-{segment['end_frame']}, 持续 {segment['duration']} 帧)")
88
+
89
+ # 🔧 确定主要运动类型
90
+ dominant_class = max(class_dist.items(), key=lambda x: x[1])
91
+ dominant_class_name = dominant_class[0]
92
+ dominant_percentage = dominant_class[1] / analysis['total_frames'] * 100
93
+
94
+ print(f"主要运动类型: {dominant_class_name} ({dominant_percentage:.1f}%)")
95
+
96
+ # 将样本添加到对应类别
97
+ class_samples[dominant_class_name].append({
98
+ 'name': item,
99
+ 'percentage': dominant_percentage,
100
+ 'analysis': analysis
101
+ })
102
+
103
+ sample_count += 1
104
+
105
+ except Exception as e:
106
+ print(f"❌ 处理样本 {item} 时出错: {e}")
107
+
108
+ print("\n" + "="*60)
109
+ print("=== 按类别分组的样本统计(基于相对于reference的变化)===")
110
+
111
+ # 🔧 按类别输出样本列表
112
+ for class_name in ['forward', 'backward', 'left_turn', 'right_turn']:
113
+ samples = class_samples[class_name]
114
+ print(f"\n🔸 {class_name.upper()} 类样本 (共 {len(samples)} 个):")
115
+
116
+ if samples:
117
+ # 按主要类别占比排序
118
+ samples.sort(key=lambda x: x['percentage'], reverse=True)
119
+
120
+ for i, sample_info in enumerate(samples, 1):
121
+ print(f" {i:2d}. {sample_info['name']} ({sample_info['percentage']:.1f}%)")
122
+
123
+ # 显示详细的段落信息
124
+ segments = sample_info['analysis']['motion_segments']
125
+ segment_summary = []
126
+ for seg in segments:
127
+ if seg['duration'] >= 2: # 只显示持续时间>=2帧的段落
128
+ segment_summary.append(f"{seg['class']}({seg['duration']})")
129
+
130
+ if segment_summary:
131
+ print(f" 段落: {' -> '.join(segment_summary)}")
132
+ else:
133
+ print(" (无样本)")
134
+
135
+ # 🔧 统计总体模式
136
+ print(f"\n" + "="*60)
137
+ print("=== 总体统计 ===")
138
+
139
+ total_forward = sum(a['class_distribution']['forward'] for a in all_analyses)
140
+ total_backward = sum(a['class_distribution']['backward'] for a in all_analyses)
141
+ total_left_turn = sum(a['class_distribution']['left_turn'] for a in all_analyses)
142
+ total_right_turn = sum(a['class_distribution']['right_turn'] for a in all_analyses)
143
+ total_frames = total_forward + total_backward + total_left_turn + total_right_turn
144
+
145
+ print(f"总样本数: {len(all_analyses)}")
146
+ print(f"总帧数: {total_frames}")
147
+ print(f"Forward: {total_forward} 帧 ({total_forward/total_frames*100:.1f}%)")
148
+ print(f"Backward: {total_backward} 帧 ({total_backward/total_frames*100:.1f}%)")
149
+ print(f"Left Turn: {total_left_turn} 帧 ({total_left_turn/total_frames*100:.1f}%)")
150
+ print(f"Right Turn: {total_right_turn} 帧 ({total_right_turn/total_frames*100:.1f}%)")
151
+
152
+ # 🔧 样本分布统计
153
+ print(f"\n按主要类型的样本分布:")
154
+ for class_name in ['forward', 'backward', 'left_turn', 'right_turn']:
155
+ count = len(class_samples[class_name])
156
+ percentage = count / len(all_analyses) * 100 if all_analyses else 0
157
+ print(f" {class_name}: {count} 样本 ({percentage:.1f}%)")
158
+
159
+ return all_analyses, class_samples
160
+
161
def calculate_relative_rotation(current_rotation, reference_rotation):
    """Compute the relative rotation quaternion q_rel = q_ref^-1 ⊗ q_current.

    Args:
        current_rotation: Quaternion [w, x, y, z] as a sequence or tensor.
        reference_rotation: Quaternion [w, x, y, z] as a sequence or tensor.

    Returns:
        torch.Tensor of shape (4,), the Hamilton product of the reference
        conjugate with the current quaternion, in [w, x, y, z] order.

    Note:
        Assumes unit quaternions, so the inverse equals the conjugate.
    """
    # as_tensor is a no-op for tensors; the original torch.tensor(...) call
    # copied the input and emitted a UserWarning when the caller (as in
    # analyze_turning_patterns_detailed) already passed tensors.
    q_current = torch.as_tensor(current_rotation, dtype=torch.float32)
    q_ref = torch.as_tensor(reference_rotation, dtype=torch.float32)

    # Inverse of a unit quaternion is its conjugate: [w, -x, -y, -z].
    # torch.stack avoids building a tensor from a list of 0-dim tensors.
    q_ref_inv = torch.stack([q_ref[0], -q_ref[1], -q_ref[2], -q_ref[3]])

    # Hamilton product q_ref_inv ⊗ q_current.
    w1, x1, y1, z1 = q_ref_inv
    w2, x2, y2, z2 = q_current

    relative_rotation = torch.stack([
        w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2,
        w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2,
        w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2,
        w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2,
    ])

    return relative_rotation
181
+
182
if __name__ == "__main__":
    # Hard-coded dataset root for this analysis run.
    dataset_path = "/share_zhuyixuan05/zhuyixuan05/nuscenes_video_generation_2"

    print("开始详细分析pose分类(基于相对于reference的变化)...")
    # num_samples caps how many sample directories are processed.
    all_analyses, class_samples = analyze_turning_patterns_detailed(dataset_path, num_samples=4000)

    print(f"\n🎉 分析完成! 共处理 {len(all_analyses)} 个样本")
scripts/batch_drone.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import random
import subprocess
import time

# Roots for encoded inputs and generated outputs; adjust to your environment.
src_root = "/share_zhuyixuan05/zhuyixuan05/spatialvid"
dst_root = "/share_zhuyixuan05/zhuyixuan05/New_spatialvid_drone_first"
infer_script = "/home/zhuyixuan05/ReCamMaster/infer_origin.py"  # change to your actual path


def main():
    """Repeatedly pick a random encoded sample and run inference on it.

    Loops until src_root has no sub-directories (interrupt with Ctrl-C
    otherwise). Fix vs. original: the loop used to run at import time;
    it is now behind a main() guard so importing this module is side-effect
    free, while `python batch_drone.py` behaves exactly as before.
    """
    while True:
        # Pick a random sample sub-directory.
        subdirs = [d for d in os.listdir(src_root) if os.path.isdir(os.path.join(src_root, d))]
        if not subdirs:
            print("没有可用的子文件夹")
            break
        chosen = random.choice(subdirs)
        chosen_dir = os.path.join(src_root, chosen)
        pth_file = os.path.join(chosen_dir, "encoded_video.pth")
        if not os.path.exists(pth_file):
            print(f"{pth_file} 不存在,跳过")
            continue

        # Output file named after the chosen sample.
        out_file = os.path.join(dst_root, f"{chosen}.mp4")
        print(f"开始生成: {pth_file} -> {out_file}")

        # Build the inference command (list form: no shell involved).
        cmd = [
            "python", infer_script,
            "--condition_pth", pth_file,
            "--output_path", out_file,
            "--prompt", "exploring the world",
            "--modality_type", "sekai",
            "--direction", "right",
            "--dit_path", "/share_zhuyixuan05/zhuyixuan05/ICLR2026/framepack_moe/step25000_first.ckpt",
            "--use_gt_prompt",
        ]

        # Restrict the child process to a single GPU.
        env = os.environ.copy()
        env["CUDA_VISIBLE_DEVICES"] = "0"

        # Run inference; errors in the child do not stop the loop.
        subprocess.run(cmd, env=env)


if __name__ == "__main__":
    main()
scripts/batch_infer.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import argparse
4
+ from pathlib import Path
5
+ import glob
6
+
7
def find_video_files(videos_dir):
    """Return all video files directly under *videos_dir*, sorted by path.

    Only extensions in ``video_extensions`` are matched (currently just
    ``.mp4``); the search is not recursive.
    """
    video_extensions = ['.mp4']
    matches = []

    for ext in video_extensions:
        matches.extend(glob.glob(os.path.join(videos_dir, f"*{ext}")))

    return sorted(matches)
17
+
18
def run_inference(condition_video, direction, dit_path, output_dir):
    """Run one infer_nus.py job for *condition_video* in the given *direction*.

    The output file is named ``<input stem>_<direction><input ext>`` inside
    *output_dir*. Returns True when the subprocess exits cleanly, False when
    it fails (the error and its stderr are printed).
    """
    # Derive the output file name from the input name plus the direction tag.
    input_filename = os.path.basename(condition_video)
    stem, ext = os.path.splitext(input_filename)
    output_filename = f"{stem}_{direction}{ext}"
    output_path = os.path.join(output_dir, output_filename)

    # Inference command (list form, no shell).
    cmd = [
        "python", "infer_nus.py",
        "--condition_video", condition_video,
        "--direction", direction,
        "--dit_path", dit_path,
        "--output_path", output_path,
    ]

    print(f"🎬 生成 {direction} 方向视频: {input_filename} -> {output_filename}")
    print(f"   命令: {' '.join(cmd)}")

    try:
        subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ 生成失败: {e}")
        print(f"   错误输出: {e.stderr}")
        return False

    print(f"✅ 成功生成: {output_path}")
    return True
47
+
48
def batch_inference(args):
    """Batch-inference driver: generate every requested direction for every video.

    For each video found in ``args.videos_dir`` and each direction in
    ``args.directions``, calls ``run_inference`` (skipping outputs that already
    exist unless ``args.overwrite``), tracking success/failure counts and
    printing a final summary. Returns None; all results are side effects
    (files in ``args.output_dir`` and console output).
    """
    videos_dir = args.videos_dir
    output_dir = args.output_dir
    directions = args.directions
    dit_path = args.dit_path

    # Validate the input directory before doing any work.
    if not os.path.exists(videos_dir):
        print(f"❌ 视频目录不存在: {videos_dir}")
        return

    # Ensure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)
    print(f"📁 输出目录: {output_dir}")

    # Collect all candidate videos (non-recursive, .mp4 only).
    video_files = find_video_files(videos_dir)

    if not video_files:
        print(f"❌ 在 {videos_dir} 中没有找到视频文件")
        return

    print(f"🎥 找到 {len(video_files)} 个视频文件:")
    for video in video_files:
        print(f"  - {os.path.basename(video)}")

    print(f"🎯 将为每个视频生成以下方向: {', '.join(directions)}")
    print(f"📊 总共将生成 {len(video_files) * len(directions)} 个视频")

    # Progress counters. Note: pre-existing (skipped) outputs count as completed.
    total_tasks = len(video_files) * len(directions)
    completed_tasks = 0
    failed_tasks = 0

    # Main loop: every (video, direction) pair is one task.
    for i, video_file in enumerate(video_files, 1):
        print(f"\n{'='*60}")
        print(f"处理视频 {i}/{len(video_files)}: {os.path.basename(video_file)}")
        print(f"{'='*60}")

        for j, direction in enumerate(directions, 1):
            print(f"\n--- 方向 {j}/{len(directions)}: {direction} ---")

            # Recompute the expected output path (same scheme as run_inference)
            # so existing results can be skipped without spawning a subprocess.
            input_filename = os.path.basename(video_file)
            name_parts = os.path.splitext(input_filename)
            output_filename = f"{name_parts[0]}_{direction}{name_parts[1]}"
            output_path = os.path.join(output_dir, output_filename)

            if os.path.exists(output_path) and not args.overwrite:
                print(f"⏭️  文件已存在,跳过: {output_filename}")
                completed_tasks += 1
                continue

            # Launch the actual inference subprocess.
            success = run_inference(
                condition_video=video_file,
                direction=direction,
                dit_path=dit_path,
                output_dir=output_dir,
            )

            if success:
                completed_tasks += 1
            else:
                failed_tasks += 1

            # Progress report after every attempted task.
            current_progress = completed_tasks + failed_tasks
            print(f"📈 进度: {current_progress}/{total_tasks} "
                  f"(成功: {completed_tasks}, 失败: {failed_tasks})")

    # Final summary.
    print(f"\n{'='*60}")
    print(f"🎉 批量推理完成!")
    print(f"📊 总任务数: {total_tasks}")
    print(f"✅ 成功: {completed_tasks}")
    print(f"❌ 失败: {failed_tasks}")
    print(f"📁 输出目录: {output_dir}")

    if failed_tasks > 0:
        print(f"⚠️  有 {failed_tasks} 个任务失败,请检查日志")

    # List everything now present in the output directory (includes files
    # from earlier runs, not only this run's products).
    if completed_tasks > 0:
        print(f"\n📋 生成的文件:")
        generated_files = glob.glob(os.path.join(output_dir, "*.mp4"))
        for file_path in sorted(generated_files):
            print(f"  - {os.path.basename(file_path)}")
138
+
139
def main():
    """CLI entry point: parse arguments, then preview (--dry_run) or run the batch.

    In dry-run mode only the planned (input, direction) -> output mapping is
    printed; otherwise ``batch_inference`` does the work.
    """
    parser = argparse.ArgumentParser(description="批量对nus/videos目录下的所有视频生成不同方向的输出")

    parser.add_argument("--videos_dir", type=str, default="/home/zhuyixuan05/ReCamMaster/nus/videos/4032",
                        help="输入视频目录路径")

    parser.add_argument("--output_dir", type=str, default="nus/infer_results/batch_dynamic_4032_noise",
                        help="输出视频目录路径")

    parser.add_argument("--directions", nargs="+",
                        default=["left_turn", "right_turn"],
                        choices=["forward", "backward", "left_turn", "right_turn"],
                        help="要生成的方向列表")

    parser.add_argument("--dit_path", type=str, default="/home/zhuyixuan05/ReCamMaster/nus_dynamic/step15000_dynamic.ckpt",
                        help="训练好的DiT模型路径")

    parser.add_argument("--overwrite", action="store_true",
                        help="是否覆盖已存在的输出文件")

    parser.add_argument("--dry_run", action="store_true",
                        help="只显示将要执行的任务,不实际运行")

    args = parser.parse_args()

    if args.dry_run:
        # Preview mode: print the task plan without spawning any subprocess.
        print("🔍 预览模式 - 只显示任务,不执行")
        videos_dir = args.videos_dir
        video_files = find_video_files(videos_dir)

        print(f"📁 输入目录: {videos_dir}")
        print(f"📁 输出目录: {args.output_dir}")
        print(f"🎥 找到视频: {len(video_files)} 个")
        print(f"🎯 生成方向: {', '.join(args.directions)}")
        print(f"📊 总任务数: {len(video_files) * len(args.directions)}")

        print(f"\n将要执行的任务:")
        for video in video_files:
            for direction in args.directions:
                # Mirror the output naming scheme used by run_inference.
                input_name = os.path.basename(video)
                name_parts = os.path.splitext(input_name)
                output_name = f"{name_parts[0]}_{direction}{name_parts[1]}"
                print(f"  {input_name} -> {output_name} ({direction})")
    else:
        batch_inference(args)

if __name__ == "__main__":
    main()
scripts/batch_nus.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import random
import subprocess
import time

# Roots for encoded inputs and generated outputs; adjust to your environment.
src_root = "/share_zhuyixuan05/zhuyixuan05/nuscenes_video_generation_dynamic/scenes"
dst_root = "/share_zhuyixuan05/zhuyixuan05/New_nus_right_2"
infer_script = "/home/zhuyixuan05/ReCamMaster/infer_moe.py"  # change to your actual path


def main():
    """Repeatedly pick a random nuScenes scene and run MoE inference on it.

    Loops until src_root has no sub-directories. Fix vs. original: the loop
    used to run at import time; it is now behind a main() guard so importing
    this module is side-effect free, while `python batch_nus.py` behaves
    exactly as before.
    """
    while True:
        # Pick a random scene sub-directory.
        subdirs = [d for d in os.listdir(src_root) if os.path.isdir(os.path.join(src_root, d))]
        if not subdirs:
            print("没有可用的子文件夹")
            break
        chosen = random.choice(subdirs)
        chosen_dir = os.path.join(src_root, chosen)
        pth_file = os.path.join(chosen_dir, "encoded_video-480p.pth")
        if not os.path.exists(pth_file):
            print(f"{pth_file} 不存在,跳过")
            continue

        # Output file named after the chosen scene.
        out_file = os.path.join(dst_root, f"{chosen}.mp4")
        print(f"开始生成: {pth_file} -> {out_file}")

        # Build the inference command (list form: no shell involved).
        cmd = [
            "python", infer_script,
            "--condition_pth", pth_file,
            "--output_path", out_file,
            "--prompt", "a car is driving",
            "--modality_type", "nuscenes",
            "--dit_path", "/share_zhuyixuan05/zhuyixuan05/ICLR2026/framepack_moe/step175000_origin_other_continue3.ckpt",
        ]

        # Restrict the child process to a single GPU.
        env = os.environ.copy()
        env["CUDA_VISIBLE_DEVICES"] = "1"

        # Run inference; errors in the child do not stop the loop.
        subprocess.run(cmd, env=env)


if __name__ == "__main__":
    main()
scripts/batch_rt.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import random
import subprocess
import time

# Roots for encoded inputs and generated outputs; adjust to your environment.
src_root = "/share_zhuyixuan05/zhuyixuan05/openx-fractal-encoded"
dst_root = "/share_zhuyixuan05/zhuyixuan05/New_RT"
infer_script = "/home/zhuyixuan05/ReCamMaster/infer_moe.py"  # change to your actual path


def main():
    """Repeatedly pick a random OpenX sample and run MoE inference on it.

    Loops until src_root has no sub-directories. Fix vs. original: the loop
    used to run at import time; it is now behind a main() guard so importing
    this module is side-effect free, while `python batch_rt.py` behaves
    exactly as before.
    """
    while True:
        # Pick a random sample sub-directory.
        subdirs = [d for d in os.listdir(src_root) if os.path.isdir(os.path.join(src_root, d))]
        if not subdirs:
            print("没有可用的子文件夹")
            break
        chosen = random.choice(subdirs)
        chosen_dir = os.path.join(src_root, chosen)
        pth_file = os.path.join(chosen_dir, "encoded_video.pth")
        if not os.path.exists(pth_file):
            print(f"{pth_file} 不存在,跳过")
            continue

        # Output file named after the chosen sample.
        out_file = os.path.join(dst_root, f"{chosen}.mp4")
        print(f"开始生成: {pth_file} -> {out_file}")

        # Build the inference command (list form: no shell involved).
        cmd = [
            "python", infer_script,
            "--condition_pth", pth_file,
            "--output_path", out_file,
            "--prompt", "A robotic arm is moving the object",
            "--modality_type", "openx",
        ]

        # Restrict the child process to a single GPU.
        env = os.environ.copy()
        env["CUDA_VISIBLE_DEVICES"] = "1"

        # Run inference; errors in the child do not stop the loop.
        subprocess.run(cmd, env=env)


if __name__ == "__main__":
    main()
scripts/batch_spa.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import random
import subprocess
import time

# Roots for encoded inputs and generated outputs; adjust to your environment.
src_root = "/share_zhuyixuan05/zhuyixuan05/spatialvid"
dst_root = "/share_zhuyixuan05/zhuyixuan05/New_spatialvid_right"
infer_script = "/home/zhuyixuan05/ReCamMaster/infer_moe.py"  # change to your actual path


def main():
    """Repeatedly pick a random SpatialVid sample and run MoE inference on it.

    Loops until src_root has no sub-directories. Fix vs. original: the loop
    used to run at import time; it is now behind a main() guard so importing
    this module is side-effect free, while `python batch_spa.py` behaves
    exactly as before.
    """
    while True:
        # Pick a random sample sub-directory.
        subdirs = [d for d in os.listdir(src_root) if os.path.isdir(os.path.join(src_root, d))]
        if not subdirs:
            print("没有可用的子文件夹")
            break
        chosen = random.choice(subdirs)
        chosen_dir = os.path.join(src_root, chosen)
        pth_file = os.path.join(chosen_dir, "encoded_video.pth")
        if not os.path.exists(pth_file):
            print(f"{pth_file} 不存在,跳过")
            continue

        # Output file named after the chosen sample.
        out_file = os.path.join(dst_root, f"{chosen}.mp4")
        print(f"开始生成: {pth_file} -> {out_file}")

        # Build the inference command (list form: no shell involved).
        cmd = [
            "python", infer_script,
            "--condition_pth", pth_file,
            "--output_path", out_file,
            "--prompt", "exploring the world",
            "--modality_type", "sekai",
            # "--direction", "left",
            "--dit_path", "/share_zhuyixuan05/zhuyixuan05/ICLR2026/framepack_moe/step175000_origin_other_continue3.ckpt",
        ]

        # Restrict the child process to a single GPU.
        env = os.environ.copy()
        env["CUDA_VISIBLE_DEVICES"] = "0"

        # Run inference; errors in the child do not stop the loop.
        subprocess.run(cmd, env=env)


if __name__ == "__main__":
    main()
scripts/batch_walk.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import random
import subprocess
import time

# Roots for encoded inputs and generated outputs; adjust to your environment.
src_root = "/share_zhuyixuan05/zhuyixuan05/nuscenes_video_generation_dynamic/scenes"
dst_root = "/share_zhuyixuan05/zhuyixuan05/New_walk"
infer_script = "/home/zhuyixuan05/ReCamMaster/infer_moe.py"  # change to your actual path


def main():
    """Repeatedly pick a random scene and run MoE inference on it.

    Loops until src_root has no sub-directories. Fix vs. original: the loop
    used to run at import time; it is now behind a main() guard so importing
    this module is side-effect free, while `python batch_walk.py` behaves
    exactly as before.
    """
    while True:
        # Pick a random scene sub-directory.
        subdirs = [d for d in os.listdir(src_root) if os.path.isdir(os.path.join(src_root, d))]
        if not subdirs:
            print("没有可用的子文件夹")
            break
        chosen = random.choice(subdirs)
        chosen_dir = os.path.join(src_root, chosen)
        pth_file = os.path.join(chosen_dir, "encoded_video-480p.pth")
        if not os.path.exists(pth_file):
            print(f"{pth_file} 不存在,跳过")
            continue

        # Output file named after the chosen scene.
        out_file = os.path.join(dst_root, f"{chosen}.mp4")
        print(f"开始生成: {pth_file} -> {out_file}")

        # Build the inference command (list form: no shell involved).
        cmd = [
            "python", infer_script,
            "--condition_pth", pth_file,
            "--output_path", out_file,
            "--prompt", "a car is driving",
            "--modality_type", "nuscenes",
            "--dit_path", "/share_zhuyixuan05/zhuyixuan05/ICLR2026/framepack_moe/step175000_origin_other_continue3.ckpt",
        ]

        # Restrict the child process to a single GPU.
        env = os.environ.copy()
        env["CUDA_VISIBLE_DEVICES"] = "1"

        # Run inference; errors in the child do not stop the loop.
        subprocess.run(cmd, env=env)


if __name__ == "__main__":
    main()
scripts/check.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import os
3
+ import argparse
4
+ from collections import defaultdict
5
+ import time
6
+
7
def load_checkpoint(ckpt_path):
    """Load a checkpoint from *ckpt_path* onto the CPU.

    Returns the deserialized object, or None when the file is missing or
    fails to load (the error is printed rather than raised, so callers can
    treat a None result as "skip this checkpoint").
    """
    if not os.path.exists(ckpt_path):
        return None

    try:
        return torch.load(ckpt_path, map_location='cpu')
    except Exception as e:
        # Best-effort loader: report and signal failure via None.
        print(f"❌ 加载检查点失败: {e}")
        return None
18
+
19
def compare_parameters(state_dict1, state_dict2, threshold=1e-8):
    """Element-wise compare two state dicts and split parameters into changed/unchanged.

    Args:
        state_dict1: Baseline state dict (name -> tensor), or None.
        state_dict2: Newer state dict (name -> tensor), or None.
        threshold: A parameter counts as "updated" when its max absolute
            difference exceeds this value.

    Returns:
        (updated_params, unchanged_params): dicts keyed by parameter name,
        each value holding 'max_diff', 'mean_diff' and 'shape'. Returns
        (None, None) when either input is None.

    Notes:
        - Parameters present in only one dict, or with mismatched shapes,
          are skipped silently.
    """
    if state_dict1 is None or state_dict2 is None:
        # Bug fix: this used to `return None`, but both callers in this file
        # unpack the result into two names *before* checking for None, which
        # raised "TypeError: cannot unpack non-iterable NoneType object".
        return None, None

    updated_params = {}
    unchanged_params = {}

    for name, param1 in state_dict1.items():
        param2 = state_dict2.get(name)
        if param2 is None or param1.shape != param2.shape:
            # Cannot diff a missing or reshaped parameter; skip it.
            continue

        # Cast to float so integer/bool buffers (e.g. step counters) compare
        # cleanly — torch.mean raises on integral dtypes otherwise.
        diff = torch.abs(param1.float() - param2.float())
        max_diff = torch.max(diff).item()
        mean_diff = torch.mean(diff).item()

        info = {
            'max_diff': max_diff,
            'mean_diff': mean_diff,
            'shape': param1.shape,
        }
        if max_diff > threshold:
            updated_params[name] = info
        else:
            unchanged_params[name] = info

    return updated_params, unchanged_params
50
+
51
def categorize_parameters(param_dict):
    """Bucket parameter-info entries by component, using substring keywords on names.

    Returns a dict with fixed keys 'moe_related', 'camera_related',
    'framepack_related', 'attention', 'other'. Groups are tested in that
    order and the first match wins; unmatched names land in 'other'.
    """
    keyword_groups = [
        ('moe_related', ('moe', 'gate', 'expert', 'processor')),
        ('camera_related', ('cam_encoder', 'projector', 'camera')),
        ('framepack_related', ('clean_x_embedder', 'framepack')),
        ('attention', ('attn', 'attention')),
    ]

    categories = {group: {} for group, _ in keyword_groups}
    categories['other'] = {}

    for name, info in param_dict.items():
        lowered = name.lower()
        for group, keywords in keyword_groups:
            if any(kw in lowered for kw in keywords):
                categories[group][name] = info
                break
        else:
            categories['other'][name] = info

    return categories
74
+
75
def print_category_summary(category_name, params, color_code=''):
    """Print a short report for one parameter category.

    Shows the parameter count, the min/max range of the per-parameter
    max/mean diffs, and the largest-changing parameters (up to 100,
    sorted by max_diff descending). Empty categories get a one-line notice.
    """
    if not params:
        print(f"{color_code} {category_name}: 无参数")
        return

    max_diffs = [info['max_diff'] for info in params.values()]
    mean_diffs = [info['mean_diff'] for info in params.values()]

    print(f"{color_code} {category_name} ({len(params)} 个参数):")
    print(f"   最大差异范围: {min(max_diffs):.2e} ~ {max(max_diffs):.2e}")
    print(f"   平均差异范围: {min(mean_diffs):.2e} ~ {max(mean_diffs):.2e}")

    # Largest-changing parameters first; cap the listing at 100 entries.
    by_change = sorted(params.items(), key=lambda kv: kv[1]['max_diff'], reverse=True)
    print(f"   变化最大的参数:")
    for rank, (name, info) in enumerate(by_change[:100], start=1):
        shape_str = 'x'.join(map(str, info['shape']))
        print(f"     {rank}. {name} [{shape_str}]: max_diff={info['max_diff']:.2e}")
95
+
96
def monitor_training(checkpoint_dir, check_interval=60):
    """Poll *checkpoint_dir* and report which parameters changed between checkpoints.

    Every *check_interval* seconds, looks for the newest ``step<N>.ckpt`` file;
    when a new step appears, diffs it against the previously seen checkpoint
    and prints categorized update statistics. Runs until Ctrl-C.

    NOTE(review): step parsing assumes file names are exactly 'step<NUM>.ckpt';
    names like 'step500_origin.ckpt' would make int() raise (caught by the
    broad except below, which just sleeps and retries).
    NOTE(review): if compare_parameters ever returns a bare None instead of a
    pair, the tuple-unpack below raises before the None check — confirm its
    contract.
    """
    print(f"🔍 开始监控训练进度...")
    print(f"📁 检查点目录: {checkpoint_dir}")
    print(f"⏰ 检查间隔: {check_interval}秒")
    print("=" * 80)

    # State carried across polling iterations.
    previous_ckpt = None
    previous_step = -1

    while True:
        try:
            # The directory may not exist yet (training not started).
            if not os.path.exists(checkpoint_dir):
                print(f"❌ 检查点目录不存在: {checkpoint_dir}")
                time.sleep(check_interval)
                continue

            ckpt_files = [f for f in os.listdir(checkpoint_dir) if f.startswith('step') and f.endswith('.ckpt')]
            if not ckpt_files:
                print("⏳ 未找到检查点文件,等待中...")
                time.sleep(check_interval)
                continue

            # Sort by step number and take the newest checkpoint.
            ckpt_files.sort(key=lambda x: int(x.replace('step', '').replace('.ckpt', '')))
            latest_ckpt_file = ckpt_files[-1]
            latest_ckpt_path = os.path.join(checkpoint_dir, latest_ckpt_file)

            # Extract the step number from the file name.
            current_step = int(latest_ckpt_file.replace('step', '').replace('.ckpt', ''))

            # Nothing new since the last poll.
            if current_step <= previous_step:
                print(f"⏳ 等待新的检查点... (当前: step{current_step})")
                time.sleep(check_interval)
                continue

            print(f"\n🔍 发现新检查点: {latest_ckpt_file}")

            # Load the new checkpoint; on failure just retry next interval.
            current_state_dict = load_checkpoint(latest_ckpt_path)
            if current_state_dict is None:
                print("❌ 无法加载当前检查点")
                time.sleep(check_interval)
                continue

            # First iteration has nothing to diff against.
            if previous_ckpt is not None:
                print(f"📊 比较 step{previous_step} -> step{current_step}")

                updated_params, unchanged_params = compare_parameters(
                    previous_ckpt, current_state_dict, threshold=1e-8
                )

                if updated_params is None:
                    print("❌ 参数比较失败")
                else:
                    # Group both result sets by component category.
                    updated_categories = categorize_parameters(updated_params)
                    unchanged_categories = categorize_parameters(unchanged_params)

                    print(f"\n✅ 已更新的参数 (总共 {len(updated_params)} 个):")
                    print_category_summary("MoE相关", updated_categories['moe_related'], '🔥')
                    print_category_summary("Camera相关", updated_categories['camera_related'], '📷')
                    print_category_summary("FramePack相关", updated_categories['framepack_related'], '🎞️')
                    print_category_summary("注意力相关", updated_categories['attention'], '👁️')
                    print_category_summary("其他", updated_categories['other'], '📦')

                    print(f"\n⚠️ 未更新的参数 (总共 {len(unchanged_params)} 个):")
                    print_category_summary("MoE相关", unchanged_categories['moe_related'], '❄️')
                    print_category_summary("Camera相关", unchanged_categories['camera_related'], '❄️')
                    print_category_summary("FramePack相关", unchanged_categories['framepack_related'], '❄️')
                    print_category_summary("注意力相关", unchanged_categories['attention'], '❄️')
                    print_category_summary("其他", unchanged_categories['other'], '❄️')

                    # Sanity check: are the components we train actually moving?
                    critical_keywords = ['moe', 'cam_encoder', 'projector', 'clean_x_embedder']
                    critical_updated = any(
                        any(keyword in name.lower() for keyword in critical_keywords)
                        for name in updated_params.keys()
                    )

                    if critical_updated:
                        print("\n✅ 关键组件正在更新!")
                    else:
                        print("\n❌ 警告:关键组件可能未在更新!")

                    # Overall fraction of parameters that changed.
                    total_params = len(updated_params) + len(unchanged_params)
                    update_rate = len(updated_params) / total_params * 100
                    print(f"\n📈 参数更新率: {update_rate:.1f}% ({len(updated_params)}/{total_params})")

            # Keep the current checkpoint in memory as next iteration's baseline.
            previous_ckpt = current_state_dict
            previous_step = current_step

            print("=" * 80)
            time.sleep(check_interval)

        except KeyboardInterrupt:
            print("\n👋 监控已停止")
            break
        except Exception as e:
            # Broad catch: keep the monitor alive across transient errors
            # (partial writes, parse failures, etc.).
            print(f"❌ 监控过程中出错: {e}")
            time.sleep(check_interval)
201
+
202
def compare_two_checkpoints(ckpt1_path, ckpt2_path):
    """One-shot comparison of two checkpoint files with a categorized report.

    Loads both checkpoints, diffs them with ``compare_parameters`` (default
    threshold), and prints updated/unchanged parameter summaries per category
    plus the overall update rate. Returns None; output is console-only.

    NOTE(review): if compare_parameters ever returns a bare None instead of
    a pair, the tuple-unpack below raises before the None check — confirm
    its contract.
    """
    print(f"🔍 比较两个检查点:")
    print(f"   检查点1: {ckpt1_path}")
    print(f"   检查点2: {ckpt2_path}")
    print("=" * 80)

    # Load both checkpoints (None on any failure).
    state_dict1 = load_checkpoint(ckpt1_path)
    state_dict2 = load_checkpoint(ckpt2_path)

    if state_dict1 is None or state_dict2 is None:
        print("❌ 无法加载检查点文件")
        return

    updated_params, unchanged_params = compare_parameters(state_dict1, state_dict2)

    if updated_params is None:
        print("❌ 参数比较失败")
        return

    # Group both result sets by component category.
    updated_categories = categorize_parameters(updated_params)
    unchanged_categories = categorize_parameters(unchanged_params)

    print(f"\n✅ 已更新的参数 (总共 {len(updated_params)} 个):")
    for category_name, params in updated_categories.items():
        print_category_summary(category_name.replace('_', ' ').title(), params, '🔥')

    print(f"\n⚠️ 未更新的参数 (总共 {len(unchanged_params)} 个):")
    for category_name, params in unchanged_categories.items():
        print_category_summary(category_name.replace('_', ' ').title(), params, '❄️')

    # Overall fraction of parameters that changed.
    total_params = len(updated_params) + len(unchanged_params)
    update_rate = len(updated_params) / total_params * 100
    print(f"\n📈 参数更新率: {update_rate:.1f}% ({len(updated_params)}/{total_params})")
240
+
241
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="检查模型参数更新情况")
    parser.add_argument("--checkpoint_dir", type=str,
                        default="/share_zhuyixuan05/zhuyixuan05/ICLR2026/framepack_moe",
                        help="检查点目录路径")
    # NOTE(review): plain default=True (not action="store_true") — any value
    # passed on the CLI arrives as a non-empty string, which is also truthy,
    # so the monitor branch below is only reachable via `--compare ""`.
    parser.add_argument("--compare", default=True,
                        help="比较两个特定检查点,而不是监控")
    parser.add_argument("--ckpt1", type=str, default="/share_zhuyixuan05/zhuyixuan05/ICLR2026/framepack_moe/step1500_origin_cam_4.ckpt")
    parser.add_argument("--ckpt2", type=str, default="/share_zhuyixuan05/zhuyixuan05/ICLR2026/framepack_moe/step500_origin_cam_4.ckpt")
    parser.add_argument("--interval", type=int, default=60,
                        help="监控检查间隔(秒)")
    # NOTE(review): --threshold is parsed but never forwarded to
    # compare_parameters / monitor_training; both use their own defaults.
    parser.add_argument("--threshold", type=float, default=1e-8,
                        help="参数变化阈值")

    args = parser.parse_args()

    if args.compare:
        # Compare mode requires both checkpoint paths.
        if not args.ckpt1 or not args.ckpt2:
            print("❌ 比较模式需要指定 --ckpt1 和 --ckpt2")
        else:
            compare_two_checkpoints(args.ckpt1, args.ckpt2)
    else:
        monitor_training(args.checkpoint_dir, args.interval)
scripts/decode_openx.py ADDED
@@ -0,0 +1,428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ from PIL import Image
5
+ import imageio
6
+ import argparse
7
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
8
+ from tqdm import tqdm
9
+ import json
10
+
11
+ class VideoDecoder:
12
    def __init__(self, vae_path, device="cuda"):
        """Initialize the video decoder: load the VAE and move it to *device*.

        Args:
            vae_path: Path to the VAE weights loaded via ModelManager.
            device: Target device string (default "cuda").

        Loads weights on CPU in bfloat16 first, then moves the pipeline to
        the target device.
        """
        self.device = device

        # Load weights on CPU; the pipeline is moved to the GPU afterwards.
        model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
        model_manager.load_models([vae_path])

        # Build the full pipeline; only its VAE is used for decoding.
        self.pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager)
        self.pipe = self.pipe.to(device)

        # Explicitly move the VAE (and its inner .model, when present) as
        # well — presumably pipe.to(device) does not reach every sub-module;
        # verify against the pipeline implementation.
        self.pipe.vae = self.pipe.vae.to(device)
        if hasattr(self.pipe.vae, 'model'):
            self.pipe.vae.model = self.pipe.vae.model.to(device)

        print(f"✅ VAE解码器初始化完成,设备: {device}")
+
31
    def decode_latents_to_video(self, latents, output_path, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
        """Decode VAE latents into an RGB video file written to `output_path`.

        Handles several possible layouts of the decoded tensor, denormalizes
        from [-1, 1] to uint8, and writes frames with imageio. Returns the
        final [T, H, W, C] uint8 numpy array. Raises ValueError when the
        decoded tensor layout cannot be recognized.
        """
        print(f"🔧 开始解码latents...")
        print(f"输入latents形状: {latents.shape}")
        print(f"输入latents设备: {latents.device}")
        print(f"输入latents数据类型: {latents.dtype}")

        # Ensure the latents carry a batch dimension.
        if len(latents.shape) == 4:  # [C, T, H, W]
            latents = latents.unsqueeze(0)  # -> [1, C, T, H, W]

        # Key fix: match the latents to the VAE's own device and dtype.
        model_dtype = next(self.pipe.vae.parameters()).dtype
        model_device = next(self.pipe.vae.parameters()).device

        print(f"模型设备: {model_device}")
        print(f"模型数据类型: {model_dtype}")

        latents = latents.to(device=model_device, dtype=model_dtype)

        print(f"解码latents形状: {latents.shape}")
        print(f"解码latents设备: {latents.device}")
        print(f"解码latents数据类型: {latents.dtype}")

        # Force the pipeline device so all downstream ops run on one device.
        self.pipe.device = model_device

        # Decode with the VAE (no gradients needed).
        with torch.no_grad():
            try:
                if tiled:
                    print("🔧 尝试tiled解码...")
                    decoded_video = self.pipe.decode_video(
                        latents,
                        tiled=True,
                        tile_size=tile_size,
                        tile_stride=tile_stride
                    )
                else:
                    print("🔧 使用非tiled解码...")
                    decoded_video = self.pipe.decode_video(latents, tiled=False)

            except Exception as e:
                print(f"decode_video失败,错误: {e}")
                import traceback
                traceback.print_exc()

                # Fallback: call the VAE decoder directly.
                try:
                    print("🔧 尝试直接调用VAE解码...")
                    decoded_video = self.pipe.vae.decode(
                        latents.squeeze(0),  # drop batch dim -> [C, T, H, W]
                        device=model_device,
                        tiled=False
                    )
                    # Re-add the batch dim: [T, H, W, C] -> [1, T, H, W, C].
                    if len(decoded_video.shape) == 4:  # [T, H, W, C]
                        decoded_video = decoded_video.unsqueeze(0)  # -> [1, T, H, W, C]
                except Exception as e2:
                    print(f"直接VAE解码也失败: {e2}")
                    raise e2

        print(f"解码后视频形状: {decoded_video.shape}")

        # Key fix: normalize whatever layout came back to [T, H, W, C].
        video_np = None

        if len(decoded_video.shape) == 5:
            # Probe the plausible 5D layouts.
            if decoded_video.shape == torch.Size([1, 3, 113, 480, 832]):
                # Layout [B, C, T, H, W] -> [T, H, W, C].
                print("🔧 检测到格式: [B, C, T, H, W]")
                video_np = decoded_video[0].permute(1, 2, 3, 0).to(torch.float32).cpu().numpy()  # [T, H, W, C]
            elif decoded_video.shape[1] == 3:
                # Second dim == 3 strongly suggests [B, C, T, H, W].
                print("🔧 检测到可能的格式: [B, C, T, H, W]")
                video_np = decoded_video[0].permute(1, 2, 3, 0).to(torch.float32).cpu().numpy()  # [T, H, W, C]
            elif decoded_video.shape[-1] == 3:
                # Last dim == 3 suggests [B, T, H, W, C].
                print("🔧 检测到格式: [B, T, H, W, C]")
                video_np = decoded_video[0].to(torch.float32).cpu().numpy()  # [T, H, W, C]
            else:
                # Last resort: look for any axis of size 3 to treat as channels.
                shape = list(decoded_video.shape)
                if 3 in shape:
                    channel_dim = shape.index(3)
                    print(f"🔧 检测到通道维度在位置: {channel_dim}")

                    if channel_dim == 1:  # [B, C, T, H, W]
                        video_np = decoded_video[0].permute(1, 2, 3, 0).to(torch.float32).cpu().numpy()
                    elif channel_dim == 4:  # [B, T, H, W, C]
                        video_np = decoded_video[0].to(torch.float32).cpu().numpy()
                    else:
                        print(f"⚠️ 未知的通道维度位置: {channel_dim}")
                        raise ValueError(f"Cannot handle channel dimension at position {channel_dim}")
                else:
                    print(f"⚠️ 未找到通道维度为3的位置,形状: {decoded_video.shape}")
                    raise ValueError(f"Cannot find channel dimension of size 3 in shape {decoded_video.shape}")

        elif len(decoded_video.shape) == 4:
            # 4D tensor: disambiguate channel-last vs channel-first.
            if decoded_video.shape[-1] == 3:  # [T, H, W, C]
                video_np = decoded_video.to(torch.float32).cpu().numpy()
            elif decoded_video.shape[0] == 3:  # [C, T, H, W]
                video_np = decoded_video.permute(1, 2, 3, 0).to(torch.float32).cpu().numpy()
            else:
                print(f"⚠️ 无法处理的4D视频形状: {decoded_video.shape}")
                raise ValueError(f"Cannot handle 4D video tensor shape: {decoded_video.shape}")
        else:
            print(f"⚠️ 意外的视频维度数: {len(decoded_video.shape)}")
            raise ValueError(f"Unexpected video tensor dimensions: {decoded_video.shape}")

        if video_np is None:
            raise ValueError("Failed to convert video tensor to numpy array")

        print(f"转换后视频数组形状: {video_np.shape}")

        # Validate the final shape before writing.
        if len(video_np.shape) != 4:
            raise ValueError(f"Expected 4D array [T, H, W, C], got {video_np.shape}")

        if video_np.shape[-1] != 3:
            print(f"⚠️ 通道数异常: 期望3,实际{video_np.shape[-1]}")
            print(f"完整形状: {video_np.shape}")
            # Try the remaining channel positions before giving up.
            if video_np.shape[0] == 3:  # [C, T, H, W]
                print("🔧 尝试重新排列: [C, T, H, W] -> [T, H, W, C]")
                video_np = np.transpose(video_np, (1, 2, 3, 0))
            elif video_np.shape[1] == 3:  # [T, C, H, W]
                print("🔧 尝试重新排列: [T, C, H, W] -> [T, H, W, C]")
                video_np = np.transpose(video_np, (0, 2, 3, 1))
            else:
                raise ValueError(f"Expected 3 channels (RGB), got {video_np.shape[-1]} channels")

        # Denormalize from [-1, 1] to uint8 [0, 255].
        video_np = (video_np * 0.5 + 0.5).clip(0, 1)
        video_np = (video_np * 255).astype(np.uint8)

        print(f"最终视频数组形状: {video_np.shape}")
        print(f"视频数组值范围: {video_np.min()} - {video_np.max()}")

        # Write the video.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        try:
            with imageio.get_writer(output_path, fps=10, quality=8) as writer:
                for frame_idx, frame in enumerate(video_np):
                    # Skip frames with a malformed shape rather than aborting.
                    if len(frame.shape) != 3 or frame.shape[-1] != 3:
                        print(f"⚠️ 帧 {frame_idx} 形状异常: {frame.shape}")
                        continue

                    writer.append_data(frame)
                    if frame_idx % 10 == 0:
                        print(f"  写入帧 {frame_idx}/{len(video_np)}")
        except Exception as e:
            print(f"保存视频失败: {e}")
            # Dump the first few frames as PNGs to aid debugging, then re-raise.
            debug_dir = os.path.join(os.path.dirname(output_path), "debug_frames")
            os.makedirs(debug_dir, exist_ok=True)

            for i in range(min(5, len(video_np))):
                frame = video_np[i]
                debug_path = os.path.join(debug_dir, f"debug_frame_{i}.png")
                try:
                    if len(frame.shape) == 3 and frame.shape[-1] == 3:
                        Image.fromarray(frame).save(debug_path)
                        print(f"调试: 保存帧 {i} 到 {debug_path}")
                    else:
                        print(f"调试: 帧 {i} 形状异常: {frame.shape}")
                except Exception as e2:
                    print(f"调试: 保存帧 {i} 失败: {e2}")
            raise e

        print(f"✅ 视频保存到: {output_path}")
        return video_np
210
+
211
+ def save_frames_as_images(self, video_np, output_dir, prefix="frame"):
212
+ """将视频帧保存为单独的图像文件"""
213
+ os.makedirs(output_dir, exist_ok=True)
214
+
215
+ for i, frame in enumerate(video_np):
216
+ frame_path = os.path.join(output_dir, f"{prefix}_{i:04d}.png")
217
+ # 🔧 验证帧形状
218
+ if len(frame.shape) == 3 and frame.shape[-1] == 3:
219
+ Image.fromarray(frame).save(frame_path)
220
+ else:
221
+ print(f"⚠️ 跳过形状异常的帧 {i}: {frame.shape}")
222
+
223
+ print(f"✅ 保存了 {len(video_np)} 帧到: {output_dir}")
224
+
225
def decode_single_episode(encoded_pth_path, vae_path, output_base_dir, device="cuda"):
    """Decode one encoded episode (.pth with VAE latents) back into a video.

    Returns True on success, False on any failure; errors are printed rather
    than raised so batch callers can keep going.
    """
    print(f"\n🔧 解码episode: {encoded_pth_path}")

    # Load the encoded payload onto CPU; the decoder moves tensors later.
    try:
        encoded_data = torch.load(encoded_pth_path, weights_only=False, map_location="cpu")
        print(f"✅ 成功加载编码数据")
    except Exception as e:
        print(f"❌ 加载编码数据失败: {e}")
        return False

    # Dump the payload structure for debugging.
    print("🔍 编码数据结构:")
    for key, value in encoded_data.items():
        if isinstance(value, torch.Tensor):
            print(f"  - {key}: {value.shape}, dtype: {value.dtype}, device: {value.device}")
        elif isinstance(value, dict):
            print(f"  - {key}: dict with keys {list(value.keys())}")
        else:
            print(f"  - {key}: {type(value)}")

    # The latents are mandatory; bail out when missing.
    latents = encoded_data.get('latents')
    if latents is None:
        print("❌ 未找到latents数据")
        return False

    # Ensure the latents sit on CPU (the expected state after map_location).
    if latents.device != torch.device('cpu'):
        latents = latents.cpu()
        print(f"🔧 将latents移动到CPU: {latents.device}")

    episode_info = encoded_data.get('episode_info', {})
    episode_idx = episode_info.get('episode_idx', 'unknown')
    # Fall back to latent_frames * 4 as an estimate of the original frame
    # count when it was not recorded (assumes 4x temporal compression —
    # NOTE(review): confirm against the encoder's VAE settings).
    total_frames = episode_info.get('total_frames', latents.shape[1] * 4)

    print(f"Episode信息:")
    print(f"  - Episode索引: {episode_idx}")
    print(f"  - Latents形状: {latents.shape}")
    print(f"  - Latents设备: {latents.device}")
    print(f"  - Latents数据类型: {latents.dtype}")
    print(f"  - 原始总帧数: {total_frames}")
    print(f"  - 压缩后帧数: {latents.shape[1]}")

    # One output folder per episode.
    episode_name = f"episode_{episode_idx:06d}" if isinstance(episode_idx, int) else f"episode_{episode_idx}"
    output_dir = os.path.join(output_base_dir, episode_name)
    os.makedirs(output_dir, exist_ok=True)

    # Build the decoder (loads the VAE checkpoint).
    try:
        decoder = VideoDecoder(vae_path, device)
    except Exception as e:
        print(f"❌ 初始化解码器失败: {e}")
        return False

    # Decode to an mp4 file.
    video_output_path = os.path.join(output_dir, "decoded_video.mp4")
    try:
        # Non-tiled decode first: simpler and avoids tiled-path edge cases.
        video_np = decoder.decode_latents_to_video(
            latents,
            video_output_path,
            tiled=False,
            tile_size=(34, 34),
            tile_stride=(18, 16)
        )

        # Save the first few frames as PNGs for a quick visual check.
        frames_dir = os.path.join(output_dir, "frames")
        sample_frames = video_np[:min(10, len(video_np))]  # first 10 frames only
        decoder.save_frames_as_images(sample_frames, frames_dir, f"frame_{episode_idx}")

        # Persist a JSON summary of this decode next to the video.
        decode_info = {
            "source_pth": encoded_pth_path,
            "decoded_video_path": video_output_path,
            "latents_shape": list(latents.shape),
            "decoded_video_shape": list(video_np.shape),
            "original_total_frames": total_frames,
            "decoded_frames": len(video_np),
            "compression_ratio": total_frames / len(video_np) if len(video_np) > 0 else 0,
            "latents_dtype": str(latents.dtype),
            "latents_device": str(latents.device),
            "vae_compression_ratio": total_frames / latents.shape[1] if latents.shape[1] > 0 else 0
        }

        info_path = os.path.join(output_dir, "decode_info.json")
        with open(info_path, 'w') as f:
            json.dump(decode_info, f, indent=2)

        print(f"✅ Episode {episode_idx} 解码完成")
        print(f"  - 原始帧数: {total_frames}")
        print(f"  - 解码帧数: {len(video_np)}")
        print(f"  - 压缩比: {decode_info['compression_ratio']:.2f}")
        print(f"  - VAE时间压缩比: {decode_info['vae_compression_ratio']:.2f}")
        return True

    except Exception as e:
        print(f"❌ 解码失败: {e}")
        import traceback
        traceback.print_exc()
        return False
328
+
329
def batch_decode_episodes(encoded_base_dir, vae_path, output_base_dir, max_episodes=None, device="cuda"):
    """Decode every encoded episode found under `encoded_base_dir`.

    Scans for `<episode_dir>/encoded_video.pth` files, decodes each via
    `decode_single_episode`, and prints running and final success stats.

    Args:
        encoded_base_dir: Directory holding one sub-directory per episode.
        vae_path: Path to the Wan2.1 VAE checkpoint.
        output_base_dir: Where decoded videos/frames are written.
        max_episodes: Optional cap on the number of episodes processed.
        device: Torch device string forwarded to the decoder.
    """
    print(f"🔧 批量解码Open-X episodes")
    print(f"源目录: {encoded_base_dir}")
    print(f"输出目录: {output_base_dir}")

    # Collect encoded_video.pth paths; sorting keeps the order deterministic.
    episode_dirs = []
    if os.path.exists(encoded_base_dir):
        for item in sorted(os.listdir(encoded_base_dir)):
            episode_dir = os.path.join(encoded_base_dir, item)
            if os.path.isdir(episode_dir):
                encoded_path = os.path.join(episode_dir, "encoded_video.pth")
                if os.path.exists(encoded_path):
                    episode_dirs.append(encoded_path)

    print(f"找到 {len(episode_dirs)} 个编码的episodes")

    # Fix: return early when nothing was found — the final success-rate
    # computation below would otherwise divide by zero.
    if not episode_dirs:
        print("⚠️ 未找到任何编码的episodes,退出")
        return

    if max_episodes and len(episode_dirs) > max_episodes:
        episode_dirs = episode_dirs[:max_episodes]
        print(f"限制处理前 {max_episodes} 个episodes")

    # Decode episodes one by one, tracking the running success rate.
    success_count = 0
    for i, encoded_pth_path in enumerate(tqdm(episode_dirs, desc="解码episodes")):
        print(f"\n{'='*60}")
        print(f"处理 {i+1}/{len(episode_dirs)}: {os.path.basename(os.path.dirname(encoded_pth_path))}")

        success = decode_single_episode(encoded_pth_path, vae_path, output_base_dir, device)
        if success:
            success_count += 1

        print(f"当前成功率: {success_count}/{i+1} ({success_count/(i+1)*100:.1f}%)")

    print(f"\n🎉 批量解码完成!")
    print(f"总处理: {len(episode_dirs)} 个episodes")
    print(f"成功解码: {success_count} 个episodes")
    print(f"成功率: {success_count/len(episode_dirs)*100:.1f}%")
367
+
368
def main():
    """CLI entry point for the latents decode-verification tool."""
    parser = argparse.ArgumentParser(description="解码Open-X编码的latents以验证正确性 - 修正版本")
    parser.add_argument("--mode", type=str, choices=["single", "batch"], default="batch",
                        help="解码模式:single (单个episode) 或 batch (批量)")
    parser.add_argument("--encoded_pth", type=str,
                        default="/share_zhuyixuan05/zhuyixuan05/openx-fractal-encoded/episode_000000/encoded_video.pth",
                        help="单个编码文件路径(single模式)")
    parser.add_argument("--encoded_base_dir", type=str,
                        default="/share_zhuyixuan05/zhuyixuan05/openx-fractal-encoded",
                        help="编码数据基础目录(batch模式)")
    parser.add_argument("--vae_path", type=str,
                        default="models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
                        help="VAE模型路径")
    parser.add_argument("--output_dir", type=str,
                        default="./decoded_results_fixed",
                        help="解码输出目录")
    parser.add_argument("--max_episodes", type=int, default=5,
                        help="最大解码episodes数量(batch模式,用于测试)")
    parser.add_argument("--device", type=str, default="cuda",
                        help="计算设备")
    args = parser.parse_args()

    print("🔧 Open-X Latents 解码验证工具 (修正版本 - Fixed)")
    print(f"模式: {args.mode}")
    print(f"VAE路径: {args.vae_path}")
    print(f"输出目录: {args.output_dir}")
    print(f"设备: {args.device}")

    # Fall back to CPU when CUDA was requested but is not available.
    if args.device == "cuda" and not torch.cuda.is_available():
        print("⚠️ CUDA不可用,切换到CPU")
        args.device = "cpu"

    os.makedirs(args.output_dir, exist_ok=True)

    if args.mode == "single":
        print(f"输入文件: {args.encoded_pth}")
        if not os.path.exists(args.encoded_pth):
            print(f"❌ 输入文件不存在: {args.encoded_pth}")
            return
        ok = decode_single_episode(args.encoded_pth, args.vae_path, args.output_dir, args.device)
        print("✅ 单个episode解码成功" if ok else "❌ 单个episode解码失败")
    elif args.mode == "batch":
        print(f"输入目录: {args.encoded_base_dir}")
        print(f"最大episodes: {args.max_episodes}")
        if not os.path.exists(args.encoded_base_dir):
            print(f"❌ 输入目录不存在: {args.encoded_base_dir}")
            return
        batch_decode_episodes(args.encoded_base_dir, args.vae_path, args.output_dir,
                              args.max_episodes, args.device)


if __name__ == "__main__":
    main()
scripts/download_recam.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
from huggingface_hub import snapshot_download

# Download the ReCamMaster checkpoint. Interrupted downloads resume
# automatically in huggingface_hub (>= 0.22), so the deprecated
# `resume_download=True` flag is no longer passed.
snapshot_download(
    repo_id="KwaiVGI/ReCamMaster-Wan2.1",
    local_dir="models/ReCamMaster/checkpoints",
)
scripts/download_wan2.1.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
from modelscope import snapshot_download


# Download the Wan2.1 T2V 1.3B base weights from ModelScope into the local
# models/ directory (the path other scripts in this repo default to).
snapshot_download("Wan-AI/Wan2.1-T2V-1.3B", local_dir="models/Wan-AI/Wan2.1-T2V-1.3B")
scripts/encode_dynamic_videos.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import lightning as pl
4
+ from PIL import Image
5
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
6
+ import json
7
+ import imageio
8
+ from torchvision.transforms import v2
9
+ from einops import rearrange
10
+ import argparse
11
+ from tqdm import tqdm
12
class VideoEncoder(pl.LightningModule):
    """Wraps the Wan2.1 text-encoder/VAE pipeline to turn videos into latents."""

    def __init__(self, text_encoder_path, vae_path, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
        super().__init__()
        # Load both models on CPU in bf16; the caller moves the module to GPU.
        model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
        model_manager.load_models([text_encoder_path, vae_path])
        self.pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager)
        # Tiling options forwarded verbatim to pipe.encode_video.
        self.tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}

        # Map PIL frames to normalized tensors in [-1, 1].
        self.frame_process = v2.Compose([
            v2.ToTensor(),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])

    def crop_and_resize(self, image, target_width=832, target_height=480):
        """Resize a PIL image to the pipeline's working resolution (bilinear).

        The 832x480 defaults match the original hard-coded values, so
        existing callers are unaffected.
        """
        # Fix: the original computed `image.size` into unused locals; removed.
        return v2.functional.resize(
            image,
            (round(target_height), round(target_width)),
            interpolation=v2.InterpolationMode.BILINEAR
        )

    def load_video_frames(self, video_path):
        """Load a whole video as a [C, T, H, W] tensor, or None when empty."""
        reader = imageio.get_reader(video_path)
        frames = []

        try:
            for frame_data in reader:
                frame = Image.fromarray(frame_data)
                frame = self.crop_and_resize(frame)
                frame = self.frame_process(frame)
                frames.append(frame)
        finally:
            # Fix: close the reader even when a frame fails to decode.
            reader.close()

        if len(frames) == 0:
            return None

        frames = torch.stack(frames, dim=0)
        frames = rearrange(frames, "T C H W -> C T H W")
        return frames
56
+
57
def encode_scenes(scenes_path, text_encoder_path, vae_path, start_idx=450):
    """Encode every scene video under `scenes_path` into VAE latents.

    For each scene directory: skips already-encoded scenes, reads
    `scene_info.json` to locate the video, encodes it with the VAE, attaches
    a shared text-prompt embedding, and saves the result as
    `encoded_video-480p-1.pth` inside the scene directory.

    Args:
        scenes_path: Root directory with one sub-directory per scene.
        text_encoder_path: Path to the T5 text-encoder checkpoint.
        vae_path: Path to the Wan2.1 VAE checkpoint.
        start_idx: Scenes before this listdir index are skipped (default 450,
            matching the previous hard-coded resume point).
    """
    encoder = VideoEncoder(text_encoder_path, vae_path)
    encoder = encoder.cuda()
    encoder.pipe.device = "cuda"

    processed_count = 0
    # Fix: the prompt embedding was previously gated on `processed_count == 0`
    # and only stored in a local on the first success — if the first scene
    # failed after `del encoder.pipe.prompter`, every later scene crashed with
    # UnboundLocalError/AttributeError. A None sentinel (the same pattern used
    # by encode_openx.py) makes the lazy encode safe.
    prompt_emb = None

    for idx, scene_name in enumerate(tqdm(os.listdir(scenes_path))):
        # Resume point: skip scenes before `start_idx`.
        if idx < start_idx:
            continue
        scene_dir = os.path.join(scenes_path, scene_name)
        if not os.path.isdir(scene_dir):
            continue

        # Skip scenes that were already encoded.
        encoded_path = os.path.join(scene_dir, "encoded_video-480p-1.pth")
        if os.path.exists(encoded_path):
            print(f"Scene {scene_name} already encoded, skipping...")
            continue

        # The scene metadata points at the video file.
        scene_info_path = os.path.join(scene_dir, "scene_info.json")
        if not os.path.exists(scene_info_path):
            continue

        with open(scene_info_path, 'r') as f:
            scene_info = json.load(f)

        video_path = os.path.join(scene_dir, scene_info['video_path'])
        if not os.path.exists(video_path):
            print(f"Video not found: {video_path}")
            continue

        try:
            print(f"Encoding scene {scene_name}...")

            video_frames = encoder.load_video_frames(video_path)
            if video_frames is None:
                print(f"Failed to load video: {video_path}")
                continue

            video_frames = video_frames.unsqueeze(0).to("cuda", dtype=torch.bfloat16)

            # Encode the video into latents.
            with torch.no_grad():
                latents = encoder.pipe.encode_video(video_frames, **encoder.tiler_kwargs)[0]

            # Encode the shared text prompt once, then drop the prompter to
            # free GPU memory.
            if prompt_emb is None:
                print('encode prompt!!!')
                prompt_emb = encoder.pipe.encode_prompt("A car driving scene captured by front camera")
                del encoder.pipe.prompter

            # Persist the encoded scene.
            encoded_data = {
                "latents": latents.cpu(),
                "prompt_emb": {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in prompt_emb.items()},
                "image_emb": {}
            }

            torch.save(encoded_data, encoded_path)
            print(f"Saved encoded data: {encoded_path}")
            processed_count += 1

        except Exception as e:
            print(f"Error encoding scene {scene_name}: {e}")
            continue

    print(f"Encoding completed! Processed {processed_count} scenes.")
131
+
132
if __name__ == "__main__":
    # CLI entry point; path defaults target the shared cluster layout.
    cli = argparse.ArgumentParser()
    for flag, default in [
        ("--scenes_path", "/share_zhuyixuan05/zhuyixuan05/nuscenes_video_generation_dynamic/scenes"),
        ("--text_encoder_path", "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth"),
        ("--vae_path", "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth"),
    ]:
        cli.add_argument(flag, type=str, default=default)
    opts = cli.parse_args()
    encode_scenes(opts.scenes_path, opts.text_encoder_path, opts.vae_path)
scripts/encode_openx.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import lightning as pl
4
+ from PIL import Image
5
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
6
+ import json
7
+ import imageio
8
+ from torchvision.transforms import v2
9
+ from einops import rearrange
10
+ import argparse
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+
14
# Key fix: set these environment variables BEFORE importing TFDS so it reads
# local data instead of trying to reach Google Cloud Storage.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TFDS_DISABLE_GCS"] = "1"

import tensorflow_datasets as tfds
import tensorflow as tf
20
+
21
+ class VideoEncoder(pl.LightningModule):
22
    def __init__(self, text_encoder_path, vae_path, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
        """Load the T5 text encoder and Wan2.1 VAE and build the pipeline.

        Models are loaded on CPU in bf16; the caller moves the module to GPU.
        """
        super().__init__()
        model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
        model_manager.load_models([text_encoder_path, vae_path])
        self.pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager)
        # Tiling options forwarded verbatim to pipe.encode_video.
        self.tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}

        # Map PIL frames to normalized tensors in [-1, 1].
        self.frame_process = v2.Compose([
            v2.ToTensor(),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])
33
+
34
+ def crop_and_resize(self, image, target_width=832, target_height=480):
35
+ """调整图像尺寸"""
36
+ image = v2.functional.resize(
37
+ image,
38
+ (target_height, target_width),
39
+ interpolation=v2.InterpolationMode.BILINEAR
40
+ )
41
+ return image
42
+
43
    def load_episode_frames(self, episode_data, max_frames=300):
        """Extract RGB frames from a fractal episode as a [C, T, H, W] tensor.

        Iterates the episode's steps, reads the first available image field
        from each observation, resizes/normalizes it, and stacks at most
        `max_frames` frames. Returns None when no frame could be extracted.
        """
        frames = []

        steps = episode_data['steps']
        frame_count = 0

        print(f"开始提取帧,最多 {max_frames} 帧...")

        for step_idx, step in enumerate(steps):
            if frame_count >= max_frames:
                break

            try:
                obs = step['observation']

                # Candidate observation keys in priority order; 'image' is the
                # field confirmed to exist in fractal20220817_data.
                img_data = None
                image_keys_to_try = [
                    'image',                  # confirmed primary image field
                    'rgb',                    # fallback RGB image
                    'camera_image',           # fallback camera image
                    'exterior_image_1_left',  # possible exterior camera
                    'wrist_image',            # possible wrist camera
                ]

                for img_key in image_keys_to_try:
                    if img_key in obs:
                        try:
                            img_tensor = obs[img_key]
                            img_data = img_tensor.numpy()
                            if step_idx < 3:  # only log for the first few steps
                                print(f"✅ 找到图像字段: {img_key}, 形状: {img_data.shape}")
                            break
                        except Exception as e:
                            if step_idx < 3:
                                print(f"尝试字段 {img_key} 失败: {e}")
                            continue

                if img_data is not None:
                    # Normalize the raw array to a uint8 RGB PIL image.
                    if len(img_data.shape) == 3:  # [H, W, C]
                        if img_data.dtype == np.uint8:
                            frame = Image.fromarray(img_data)
                        else:
                            # Float images: rescale [0, 1] data to [0, 255].
                            if img_data.max() <= 1.0:
                                img_data = (img_data * 255).astype(np.uint8)
                            else:
                                img_data = img_data.astype(np.uint8)
                            frame = Image.fromarray(img_data)

                        # Force RGB if the source had another mode.
                        if frame.mode != 'RGB':
                            frame = frame.convert('RGB')

                        frame = self.crop_and_resize(frame)
                        frame = self.frame_process(frame)
                        frames.append(frame)
                        frame_count += 1

                        if frame_count % 50 == 0:
                            print(f"已处理 {frame_count} 帧")
                    else:
                        if step_idx < 5:
                            print(f"步骤 {step_idx}: 图像形状不正确 {img_data.shape}")
                else:
                    # No usable image field: dump the available observation
                    # keys for the first few steps only.
                    if step_idx < 5:
                        available_keys = list(obs.keys())
                        print(f"步骤 {step_idx}: 未找到图像,可用键: {available_keys}")

            except Exception as e:
                print(f"处理步骤 {step_idx} 时出错: {e}")
                continue

        print(f"成功提取 {len(frames)} 帧")

        if len(frames) == 0:
            return None

        frames = torch.stack(frames, dim=0)
        frames = rearrange(frames, "T C H W -> C T H W")
        return frames
127
+
128
+ def extract_camera_poses(self, episode_data, num_frames):
129
+ """🔧 从fractal数据集提取相机位姿信息 - 基于实际observation和action字段优化"""
130
+ camera_poses = []
131
+
132
+ steps = episode_data['steps']
133
+ frame_count = 0
134
+
135
+ print("提取相机位姿信息...")
136
+
137
+ # 🔧 累积位姿信息
138
+ cumulative_translation = np.array([0.0, 0.0, 0.0], dtype=np.float32)
139
+ cumulative_rotation = np.array([0.0, 0.0, 0.0], dtype=np.float32) # 欧拉角
140
+
141
+ for step_idx, step in enumerate(steps):
142
+ if frame_count >= num_frames:
143
+ break
144
+
145
+ try:
146
+ obs = step['observation']
147
+ action = step.get('action', {})
148
+
149
+ # 🔧 基于实际的字段提取位姿变化
150
+ pose_data = {}
151
+ found_pose = False
152
+
153
+ # 1. 优先使用action中的world_vector(世界坐标系中的位移)
154
+ if 'world_vector' in action:
155
+ try:
156
+ world_vector = action['world_vector'].numpy()
157
+ if len(world_vector) == 3:
158
+ # 累积世界坐标位移
159
+ cumulative_translation += world_vector
160
+ pose_data['translation'] = cumulative_translation.copy()
161
+ found_pose = True
162
+
163
+ if step_idx < 3:
164
+ print(f"使用action.world_vector: {world_vector}, 累积位移: {cumulative_translation}")
165
+ except Exception as e:
166
+ if step_idx < 3:
167
+ print(f"action.world_vector提取失败: {e}")
168
+
169
+ # 2. 使用action中的rotation_delta(旋转变化)
170
+ if 'rotation_delta' in action:
171
+ try:
172
+ rotation_delta = action['rotation_delta'].numpy()
173
+ if len(rotation_delta) == 3:
174
+ # 累积旋转变化
175
+ cumulative_rotation += rotation_delta
176
+
177
+ # 转换为四元数(简化版本)
178
+ euler_angles = cumulative_rotation
179
+ # 欧拉角转四元数(ZYX顺序)
180
+ roll, pitch, yaw = euler_angles[0], euler_angles[1], euler_angles[2]
181
+
182
+ # 简化的欧拉角到四元数转换
183
+ cy = np.cos(yaw * 0.5)
184
+ sy = np.sin(yaw * 0.5)
185
+ cp = np.cos(pitch * 0.5)
186
+ sp = np.sin(pitch * 0.5)
187
+ cr = np.cos(roll * 0.5)
188
+ sr = np.sin(roll * 0.5)
189
+
190
+ qw = cr * cp * cy + sr * sp * sy
191
+ qx = sr * cp * cy - cr * sp * sy
192
+ qy = cr * sp * cy + sr * cp * sy
193
+ qz = cr * cp * sy - sr * sp * cy
194
+
195
+ pose_data['rotation'] = np.array([qw, qx, qy, qz], dtype=np.float32)
196
+ found_pose = True
197
+
198
+ if step_idx < 3:
199
+ print(f"使用action.rotation_delta: {rotation_delta}, 累积旋转: {cumulative_rotation}")
200
+ except Exception as e:
201
+ if step_idx < 3:
202
+ print(f"action.rotation_delta提取失败: {e}")
203
+
204
+ # 确保rotation字段存在
205
+ if 'rotation' not in pose_data:
206
+ # 使用当前累积的旋转计算四元数
207
+ roll, pitch, yaw = cumulative_rotation[0], cumulative_rotation[1], cumulative_rotation[2]
208
+
209
+ cy = np.cos(yaw * 0.5)
210
+ sy = np.sin(yaw * 0.5)
211
+ cp = np.cos(pitch * 0.5)
212
+ sp = np.sin(pitch * 0.5)
213
+ cr = np.cos(roll * 0.5)
214
+ sr = np.sin(roll * 0.5)
215
+
216
+ qw = cr * cp * cy + sr * sp * sy
217
+ qx = sr * cp * cy - cr * sp * sy
218
+ qy = cr * sp * cy + sr * cp * sy
219
+ qz = cr * cp * sy - sr * sp * cy
220
+
221
+ pose_data['rotation'] = np.array([qw, qx, qy, qz], dtype=np.float32)
222
+
223
+ camera_poses.append(pose_data)
224
+ frame_count += 1
225
+
226
+ except Exception as e:
227
+ print(f"提取位姿步骤 {step_idx} 时出错: {e}")
228
+ # 添加默认位姿
229
+ pose_data = {
230
+ 'translation': cumulative_translation.copy(),
231
+ 'rotation': np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float32)
232
+ }
233
+ camera_poses.append(pose_data)
234
+ frame_count += 1
235
+
236
+ print(f"提取了 {len(camera_poses)} 个位姿")
237
+ print(f"最终累积位移: {cumulative_translation}")
238
+ print(f"最终累积旋转: {cumulative_rotation}")
239
+
240
+ return camera_poses
241
+
242
+ def create_camera_matrices(self, camera_poses):
243
+ """将位姿转换为4x4变换矩阵"""
244
+ matrices = []
245
+
246
+ for pose in camera_poses:
247
+ matrix = np.eye(4, dtype=np.float32)
248
+
249
+ # 设置平移
250
+ matrix[:3, 3] = pose['translation']
251
+
252
+ # 设置旋转 - 假设是四元数 [w, x, y, z]
253
+ if len(pose['rotation']) == 4:
254
+ # 四元数转旋转矩阵
255
+ q = pose['rotation']
256
+ w, x, y, z = q[0], q[1], q[2], q[3]
257
+
258
+ # 四元数到旋转矩阵的转换
259
+ matrix[0, 0] = 1 - 2*(y*y + z*z)
260
+ matrix[0, 1] = 2*(x*y - w*z)
261
+ matrix[0, 2] = 2*(x*z + w*y)
262
+ matrix[1, 0] = 2*(x*y + w*z)
263
+ matrix[1, 1] = 1 - 2*(x*x + z*z)
264
+ matrix[1, 2] = 2*(y*z - w*x)
265
+ matrix[2, 0] = 2*(x*z - w*y)
266
+ matrix[2, 1] = 2*(y*z + w*x)
267
+ matrix[2, 2] = 1 - 2*(x*x + y*y)
268
+ elif len(pose['rotation']) == 3:
269
+ # 欧拉角转换(如果需要)
270
+ pass
271
+
272
+ matrices.append(matrix)
273
+
274
+ return np.array(matrices)
275
+
276
def encode_fractal_dataset(dataset_path, text_encoder_path, vae_path, output_dir, max_episodes=None):
    """Encode the fractal20220817_data dataset into Wan2.1 VAE latents.

    For each episode: extracts frames and camera poses, encodes the video
    with the VAE, attaches a shared text-prompt embedding, and saves the
    bundle as `<output_dir>/episode_XXXXXX/encoded_video.pth`. Episodes with
    an existing output file are skipped.
    """
    encoder = VideoEncoder(text_encoder_path, vae_path)
    encoder = encoder.cuda()
    encoder.pipe.device = "cuda"

    os.makedirs(output_dir, exist_ok=True)

    processed_count = 0
    prompt_emb = None  # encoded lazily on the first episode, then reused

    try:
        # Load the dataset from the local TFDS data dir (GCS disabled above).
        ds = tfds.load(
            "fractal20220817_data",
            split="train",
            data_dir=dataset_path,
        )

        print(f"✅ 成功加载fractal20220817_data数据集")

        # Optionally cap the number of episodes processed.
        if max_episodes:
            ds = ds.take(max_episodes)
            print(f"限制处理episodes数量: {max_episodes}")

    except Exception as e:
        print(f"❌ 加载数据集失败: {e}")
        return

    for episode_idx, episode in enumerate(tqdm(ds, desc="处理episodes")):
        try:
            episode_name = f"episode_{episode_idx:06d}"
            save_episode_dir = os.path.join(output_dir, episode_name)

            # Skip episodes that were already encoded.
            encoded_path = os.path.join(save_episode_dir, "encoded_video.pth")
            if os.path.exists(encoded_path):
                print(f"Episode {episode_name} 已处理,跳过...")
                processed_count += 1
                continue

            os.makedirs(save_episode_dir, exist_ok=True)

            print(f"\n🔧 处理episode {episode_name}...")

            # Structure diagnostics for the first couple of episodes only.
            if episode_idx < 2:
                print("Episode结构分析:")
                for key in episode.keys():
                    print(f"  - {key}: {type(episode[key])}")

                # Inspect the first step's structure.
                steps = episode['steps']
                for step in steps.take(1):
                    print("第一个step结构:")
                    for key in step.keys():
                        print(f"  - {key}: {type(step[key])}")

                    if 'observation' in step:
                        obs = step['observation']
                        print("  observation键:")
                        print(f"    🔍 可用字段: {list(obs.keys())}")

                        # Probe the image- and pose-related observation fields.
                        key_fields = ['image', 'vector_to_go', 'rotation_delta_to_go', 'base_pose_tool_reached']
                        for key in key_fields:
                            if key in obs:
                                try:
                                    value = obs[key]
                                    if hasattr(value, 'shape'):
                                        print(f"    ✅ {key}: {type(value)}, shape: {value.shape}")
                                    else:
                                        print(f"    ✅ {key}: {type(value)}")
                                except Exception as e:
                                    print(f"    ❌ {key}: 无法访问 ({e})")

                    if 'action' in step:
                        action = step['action']
                        print("  action键:")
                        print(f"    🔍 可用字段: {list(action.keys())}")

                        # Probe the pose-related action fields.
                        key_fields = ['world_vector', 'rotation_delta', 'base_displacement_vector']
                        for key in key_fields:
                            if key in action:
                                try:
                                    value = action[key]
                                    if hasattr(value, 'shape'):
                                        print(f"    ✅ {key}: {type(value)}, shape: {value.shape}")
                                    else:
                                        print(f"    ✅ {key}: {type(value)}")
                                except Exception as e:
                                    print(f"    ❌ {key}: 无法访问 ({e})")

            # Extract the video frames.
            video_frames = encoder.load_episode_frames(episode)
            if video_frames is None:
                print(f"❌ 无法加载episode {episode_name}的视频帧")
                continue

            print(f"✅ Episode {episode_name} 视频形状: {video_frames.shape}")

            # Extract camera poses matching the extracted frame count.
            num_frames = video_frames.shape[1]
            camera_poses = encoder.extract_camera_poses(episode, num_frames)
            camera_matrices = encoder.create_camera_matrices(camera_poses)

            print(f"🔧 编码episode {episode_name}...")

            # Camera conditioning: per-frame extrinsics + placeholder intrinsic.
            cam_emb = {
                'extrinsic': camera_matrices,
                'intrinsic': np.eye(3, dtype=np.float32)
            }

            # Encode the video into latents.
            frames_batch = video_frames.unsqueeze(0).to("cuda", dtype=torch.bfloat16)

            with torch.no_grad():
                latents = encoder.pipe.encode_video(frames_batch, **encoder.tiler_kwargs)[0]

            # Encode the shared text prompt once; drop the prompter afterwards
            # to free memory.
            if prompt_emb is None:
                print('🔧 编码prompt...')
                prompt_emb = encoder.pipe.encode_prompt(
                    "A video of robotic manipulation task with camera movement"
                )
                del encoder.pipe.prompter

            # Persist everything needed by the training pipeline.
            encoded_data = {
                "latents": latents.cpu(),
                "prompt_emb": {k: v.cpu() if isinstance(v, torch.Tensor) else v
                               for k, v in prompt_emb.items()},
                "cam_emb": cam_emb,
                "episode_info": {
                    "episode_idx": episode_idx,
                    "total_frames": video_frames.shape[1],
                    "pose_extraction_method": "observation_action_based"
                }
            }

            torch.save(encoded_data, encoded_path)
            print(f"✅ 保存编码数据: {encoded_path}")

            processed_count += 1
            print(f"✅ 已处理 {processed_count} 个episodes")

        except Exception as e:
            print(f"❌ 处理episode {episode_idx}时出错: {e}")
            import traceback
            traceback.print_exc()
            continue

    print(f"🎉 编码完成! 总共处理了 {processed_count} 个episodes")
434
if __name__ == "__main__":
    # Command-line entry point; defaults target the shared cluster layout.
    parser = argparse.ArgumentParser(description="Encode Open-X Fractal20220817 Dataset - Based on Real Structure")
    parser.add_argument("--dataset_path", type=str,
                        default="/share_zhuyixuan05/public_datasets/open-x/0.1.0",
                        help="Path to tensorflow_datasets directory")
    parser.add_argument("--text_encoder_path", type=str,
                        default="models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth")
    parser.add_argument("--vae_path", type=str,
                        default="models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth")
    parser.add_argument("--output_dir", type=str,
                        default="/share_zhuyixuan05/zhuyixuan05/openx-fractal-encoded")
    parser.add_argument("--max_episodes", type=int, default=10000,
                        help="Maximum number of episodes to process (default: 10 for testing)")
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    # Startup banner.
    for line in (
        "🚀 开始编码Open-X Fractal数据集 (基于实际字段结构)...",
        f"📁 数据集路径: {args.dataset_path}",
        f"💾 输出目录: {args.output_dir}",
        f"🔢 最大处理episodes: {args.max_episodes}",
        "🔧 基于实际observation和action字段的位姿提取方法",
        "✅ 优先使用 'image' 字段获取图像数据",
    ):
        print(line)

    encode_fractal_dataset(
        args.dataset_path,
        args.text_encoder_path,
        args.vae_path,
        args.output_dir,
        args.max_episodes,
    )
scripts/encode_rlbench_video.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import lightning as pl
4
+ from PIL import Image
5
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
6
+ import json
7
+ import imageio
8
+ from torchvision.transforms import v2
9
+ from einops import rearrange
10
+ import argparse
11
+ import numpy as np
12
+ import pdb
13
+ from tqdm import tqdm
14
+
15
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
16
+
17
class VideoEncoder(pl.LightningModule):
    """Loads the Wan2.1 T5 text encoder + VAE and exposes video -> latent encoding.

    Only the encoder components are loaded (no diffusion weights); `self.pipe`
    is used downstream exclusively for `encode_video` / `encode_prompt`.
    """

    def __init__(self, text_encoder_path, vae_path, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
        """Build the pipeline on CPU in bfloat16; the caller moves it to GPU.

        Args:
            text_encoder_path: path to the UMT5 text-encoder checkpoint.
            vae_path: path to the Wan2.1 VAE checkpoint.
            tiled / tile_size / tile_stride: forwarded to the VAE so large
                frames are encoded tile-by-tile to bound GPU memory use.
        """
        super().__init__()
        model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
        model_manager.load_models([text_encoder_path, vae_path])
        self.pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager)
        self.tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}

        # Map uint8 PIL frames to float tensors normalized to [-1, 1].
        self.frame_process = v2.Compose([
            v2.ToTensor(),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])

    def crop_and_resize(self, image, target_width=512, target_height=512):
        """Resize a PIL image to the encoder resolution (default 512x512).

        The target size is parameterized (backward-compatible defaults) instead
        of hard-coded; the previously computed-but-unused source width/height
        locals were removed.
        """
        return v2.functional.resize(
            image,
            (round(target_height), round(target_width)),
            interpolation=v2.InterpolationMode.BILINEAR,
        )

    def load_video_frames(self, video_path):
        """Decode every frame of `video_path` into a (C, T, H, W) float tensor.

        Returns None when the video yields no frames. The reader is closed in
        a `finally` block so a decode error cannot leak the file handle.
        """
        reader = imageio.get_reader(video_path)
        frames = []
        try:
            for frame_data in reader:
                frame = Image.fromarray(frame_data)
                frame = self.crop_and_resize(frame)
                frame = self.frame_process(frame)
                frames.append(frame)
        finally:
            reader.close()

        if not frames:
            return None

        frames = torch.stack(frames, dim=0)
        return rearrange(frames, "T C H W -> C T H W")
62
+
63
def encode_scenes(scenes_path, text_encoder_path, vae_path,output_dir):
    """Encode every RLBench demo video under `scenes_path` into VAE latents.

    Directory layout assumed: <scenes_path>/<scene>/<demo>/*.mp4 with a sibling
    pose file whose name replaces "side.mp4" with "data.npy". Each demo writes
    <output_dir>/<scene>_<demo>/encoded_video.pth containing {"latents", "cam_emb"}.
    """

    encoder = VideoEncoder(text_encoder_path, vae_path)
    encoder = encoder.cuda()
    encoder.pipe.device = "cuda"

    processed_count = 0
    prompt_emb = 0  # unused here; prompt encoding is commented out below

    os.makedirs(output_dir,exist_ok=True)

    for i, scene_name in enumerate(os.listdir(scenes_path)):
        scene_dir = os.path.join(scenes_path, scene_name)
        for j, demo_name in tqdm(enumerate(os.listdir(scene_dir)),total=len(os.listdir(scene_dir))):
            demo_dir = os.path.join(scene_dir, demo_name)
            for filename in os.listdir(demo_dir):
                # Only encode .mp4 files (case-insensitive extension check).
                if filename.lower().endswith('.mp4'):
                    full_path = os.path.join(demo_dir, filename)
                    print(full_path)
                    save_dir = os.path.join(output_dir,scene_name+'_'+demo_name)

                    os.makedirs(save_dir,exist_ok=True)
                    # Resume support: skip demos that already have an encoded file.
                    encoded_path = os.path.join(save_dir, "encoded_video.pth")
                    if os.path.exists(encoded_path):
                        print(f"Scene {scene_name} already encoded, skipping...")
                        continue

                    # Camera/pose data is expected next to the video.
                    # NOTE(review): if the filename does not end in "side.mp4" this
                    # replace is a no-op and scene_cam_path points at the video itself.
                    scene_cam_path = full_path.replace("side.mp4", "data.npy")
                    print(scene_cam_path)
                    if not os.path.exists(scene_cam_path):
                        continue

                    cam_data = np.load(scene_cam_path)
                    cam_emb = cam_data
                    print(cam_data.shape)

                    video_path = full_path
                    if not os.path.exists(video_path):
                        print(f"Video not found: {video_path}")
                        continue

                    print(f"Encoding scene {scene_name}...Demo {demo_name}")

                    # Decode the full video; None means zero frames.
                    video_frames = encoder.load_video_frames(video_path)
                    if video_frames is None:
                        print(f"Failed to load video: {video_path}")
                        continue

                    # (1, C, T, H, W) on GPU in bfloat16 for the VAE.
                    video_frames = video_frames.unsqueeze(0).to("cuda", dtype=torch.bfloat16)
                    print('video shape:',video_frames.shape)
                    # Encode video to latents (no grad needed for inference).
                    with torch.no_grad():
                        latents = encoder.pipe.encode_video(video_frames, **encoder.tiler_kwargs)[0]

                    # Persist latents + raw camera array; prompt embedding is
                    # intentionally omitted (see commented-out code in history).
                    encoded_data = {
                        "latents": latents.cpu(),
                        "cam_emb": cam_emb
                    }
                    torch.save(encoded_data, encoded_path)
                    print(f"Saved encoded data: {encoded_path}")
                    processed_count += 1

    print(f"Encoding completed! Processed {processed_count} scenes.")
157
+
158
if __name__ == "__main__":
    # CLI entry point: all four options are plain string paths, so build the
    # parser from a flag -> default table instead of repeated add_argument calls.
    _defaults = {
        "--scenes_path": "/share_zhuyixuan05/zhuyixuan05/RLBench",
        "--text_encoder_path": "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
        "--vae_path": "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
        "--output_dir": "/share_zhuyixuan05/zhuyixuan05/rlbench",
    }
    parser = argparse.ArgumentParser()
    for _flag, _default in _defaults.items():
        parser.add_argument(_flag, type=str, default=_default)
    args = parser.parse_args()
    encode_scenes(args.scenes_path, args.text_encoder_path, args.vae_path, args.output_dir)
scripts/encode_sekai_video.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import lightning as pl
4
+ from PIL import Image
5
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
6
+ import json
7
+ import imageio
8
+ from torchvision.transforms import v2
9
+ from einops import rearrange
10
+ import argparse
11
+ import numpy as np
12
+ import pdb
13
+ from tqdm import tqdm
14
+
15
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
16
+
17
class VideoEncoder(pl.LightningModule):
    """Loads the Wan2.1 T5 text encoder + VAE and exposes video -> latent encoding.

    Only the encoder components are loaded (no diffusion weights); `self.pipe`
    is used downstream exclusively for `encode_video` / `encode_prompt`.
    """

    def __init__(self, text_encoder_path, vae_path, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
        """Build the pipeline on CPU in bfloat16; the caller moves it to GPU.

        Args:
            text_encoder_path: path to the UMT5 text-encoder checkpoint.
            vae_path: path to the Wan2.1 VAE checkpoint.
            tiled / tile_size / tile_stride: forwarded to the VAE so large
                frames are encoded tile-by-tile to bound GPU memory use.
        """
        super().__init__()
        model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
        model_manager.load_models([text_encoder_path, vae_path])
        self.pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager)
        self.tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}

        # Map uint8 PIL frames to float tensors normalized to [-1, 1].
        self.frame_process = v2.Compose([
            v2.ToTensor(),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])

    def crop_and_resize(self, image, target_width=832, target_height=480):
        """Resize a PIL image to the encoder resolution (default 832x480).

        The target size is parameterized (backward-compatible defaults) instead
        of hard-coded; the previously computed-but-unused source width/height
        locals were removed.
        """
        return v2.functional.resize(
            image,
            (round(target_height), round(target_width)),
            interpolation=v2.InterpolationMode.BILINEAR,
        )

    def load_video_frames(self, video_path):
        """Decode every frame of `video_path` into a (C, T, H, W) float tensor.

        Returns None when the video yields no frames. The reader is closed in
        a `finally` block so a decode error cannot leak the file handle.
        """
        reader = imageio.get_reader(video_path)
        frames = []
        try:
            for frame_data in reader:
                frame = Image.fromarray(frame_data)
                frame = self.crop_and_resize(frame)
                frame = self.frame_process(frame)
                frames.append(frame)
        finally:
            reader.close()

        if not frames:
            return None

        frames = torch.stack(frames, dim=0)
        return rearrange(frames, "T C H W -> C T H W")
62
+
63
def encode_scenes(scenes_path, text_encoder_path, vae_path,output_dir):
    """Encode every Sekai walking clip (<clip>.mp4) under `scenes_path`.

    For each .mp4 with a sibling <clip>.npz of camera parameters, writes
    <output_dir>/<clip>/encoded_video.pth containing {"latents", "cam_emb"}.
    """

    encoder = VideoEncoder(text_encoder_path, vae_path)
    encoder = encoder.cuda()
    encoder.pipe.device = "cuda"

    processed_count = 0
    prompt_emb = 0  # placeholder until the shared prompt is encoded once below

    os.makedirs(output_dir,exist_ok=True)

    for i, scene_name in tqdm(enumerate(os.listdir(scenes_path)),total=len(os.listdir(scenes_path))):
        scene_dir = os.path.join(scenes_path, scene_name)
        save_dir = os.path.join(output_dir,scene_name.split('.')[0])

        # Only .mp4 files are encodable; skip the .npz camera files and misc.
        if not scene_dir.endswith(".mp4"):
            continue

        os.makedirs(save_dir,exist_ok=True)
        # Resume support: skip clips that already have an encoded file.
        encoded_path = os.path.join(save_dir, "encoded_video.pth")
        if os.path.exists(encoded_path):
            print(f"Scene {scene_name} already encoded, skipping...")
            continue

        # Camera parameters live next to the video as <clip>.npz.
        scene_cam_path = scene_dir.replace(".mp4", ".npz")
        if not os.path.exists(scene_cam_path):
            continue

        with np.load(scene_cam_path) as data:
            cam_data = data.files
            # NOTE(review): np.load yields ndarrays, so the .cpu() branch is dead code.
            cam_emb = {k: data[k].cpu() if isinstance(data[k], torch.Tensor) else data[k] for k in cam_data}

        video_path = scene_dir
        if not os.path.exists(video_path):
            print(f"Video not found: {video_path}")
            continue

        print(f"Encoding scene {scene_name}...")

        # Decode the whole clip; None means zero frames.
        video_frames = encoder.load_video_frames(video_path)
        if video_frames is None:
            print(f"Failed to load video: {video_path}")
            continue

        # (1, C, T, H, W) on GPU in bfloat16 for the VAE.
        video_frames = video_frames.unsqueeze(0).to("cuda", dtype=torch.bfloat16)
        print('video shape:',video_frames.shape)
        with torch.no_grad():
            latents = encoder.pipe.encode_video(video_frames, **encoder.tiler_kwargs)[0]

        # Encode the shared prompt once, then drop the prompter to free memory.
        # NOTE(review): prompt_emb is computed but never saved (the dict entry is
        # commented out below) — confirm whether it should be persisted.
        if processed_count == 0:
            print('encode prompt!!!')
            prompt_emb = encoder.pipe.encode_prompt("A video of a scene shot using a pedestrian's front camera while walking")
            del encoder.pipe.prompter
        encoded_data = {
            "latents": latents.cpu(),
            # "prompt_emb": {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in prompt_emb.items()},
            "cam_emb": cam_emb
        }
        torch.save(encoded_data, encoded_path)
        print(f"Saved encoded data: {encoded_path}")
        processed_count += 1

    print(f"Encoding completed! Processed {processed_count} scenes.")
149
+
150
if __name__ == "__main__":
    # CLI entry point: all four options are plain string paths, so build the
    # parser from a flag -> default table instead of repeated add_argument calls.
    _defaults = {
        "--scenes_path": "/share_zhuyixuan05/public_datasets/sekai/Sekai-Project/sekai-game-walking",
        "--text_encoder_path": "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
        "--vae_path": "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
        "--output_dir": "/share_zhuyixuan05/zhuyixuan05/sekai-game-walking",
    }
    parser = argparse.ArgumentParser()
    for _flag, _default in _defaults.items():
        parser.add_argument(_flag, type=str, default=_default)
    args = parser.parse_args()
    encode_scenes(args.scenes_path, args.text_encoder_path, args.vae_path, args.output_dir)
scripts/encode_sekai_walking.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import torch
4
+ import lightning as pl
5
+ from PIL import Image
6
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
7
+ import json
8
+ import imageio
9
+ from torchvision.transforms import v2
10
+ from einops import rearrange
11
+ import argparse
12
+ import numpy as np
13
+ import pdb
14
+ from tqdm import tqdm
15
+
16
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
17
+
18
class VideoEncoder(pl.LightningModule):
    """Loads the Wan2.1 T5 text encoder + VAE and exposes video -> latent encoding.

    Only the encoder components are loaded (no diffusion weights); `self.pipe`
    is used downstream exclusively for `encode_video` / `encode_prompt`.
    """

    def __init__(self, text_encoder_path, vae_path, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
        """Build the pipeline on CPU in bfloat16; the caller moves it to GPU.

        Args:
            text_encoder_path: path to the UMT5 text-encoder checkpoint.
            vae_path: path to the Wan2.1 VAE checkpoint.
            tiled / tile_size / tile_stride: forwarded to the VAE so large
                frames are encoded tile-by-tile to bound GPU memory use.
        """
        super().__init__()
        model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
        model_manager.load_models([text_encoder_path, vae_path])
        self.pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager)
        self.tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}

        # Map uint8 PIL frames to float tensors normalized to [-1, 1].
        self.frame_process = v2.Compose([
            v2.ToTensor(),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])

    def crop_and_resize(self, image, target_width=832, target_height=480):
        """Resize a PIL image to the encoder resolution (default 832x480).

        The target size is parameterized (backward-compatible defaults) instead
        of hard-coded; the previously computed-but-unused source width/height
        locals were removed.
        """
        return v2.functional.resize(
            image,
            (round(target_height), round(target_width)),
            interpolation=v2.InterpolationMode.BILINEAR,
        )

    def load_video_frames(self, video_path):
        """Decode every frame of `video_path` into a (C, T, H, W) float tensor.

        Returns None when the video yields no frames. The reader is closed in
        a `finally` block so a decode error cannot leak the file handle.
        """
        reader = imageio.get_reader(video_path)
        frames = []
        try:
            for frame_data in reader:
                frame = Image.fromarray(frame_data)
                frame = self.crop_and_resize(frame)
                frame = self.frame_process(frame)
                frames.append(frame)
        finally:
            reader.close()

        if not frames:
            return None

        frames = torch.stack(frames, dim=0)
        return rearrange(frames, "T C H W -> C T H W")
63
+
64
def encode_scenes(scenes_path, text_encoder_path, vae_path,output_dir):
    """Encode Sekai walking clips into fixed-size 300-frame latent chunks.

    Clip filenames encode their frame span as <video>_<start>_<end>.mp4. Each
    clip is split into chunks of `chunk_size` frames; every chunk is saved to
    <output_dir>/<video>_<start7>_<end7>/encoded_video.pth with its sliced
    camera extrinsics and the shared intrinsics.
    """

    encoder = VideoEncoder(text_encoder_path, vae_path)
    encoder = encoder.cuda()
    encoder.pipe.device = "cuda"

    processed_count = 0        # number of source clips fully processed
    processed_chunk_count = 0  # number of chunk files written
    prompt_emb = 0             # unused here; prompt encoding is commented out

    os.makedirs(output_dir,exist_ok=True)
    chunk_size = 300
    for i, scene_name in tqdm(enumerate(os.listdir(scenes_path)),total=len(os.listdir(scenes_path))):
        print('index:',i)
        scene_dir = os.path.join(scenes_path, scene_name)

        # Only .mp4 files are encodable; skip the .npz camera files and misc.
        if not scene_dir.endswith(".mp4"):
            continue

        # Camera parameters live next to the video as <clip>.npz.
        scene_cam_path = scene_dir.replace(".mp4", ".npz")
        if not os.path.exists(scene_cam_path):
            continue

        with np.load(scene_cam_path) as data:
            cam_data = data.files
            # NOTE(review): np.load yields ndarrays, so the .cpu() branch is dead code.
            cam_emb = {k: data[k].cpu() if isinstance(data[k], torch.Tensor) else data[k] for k in cam_data}

        # Filename convention: <video>_<start_frame>_<end_frame>.mp4
        video_name = scene_name[:-4].split('_')[0]
        start_frame = int(scene_name[:-4].split('_')[1])
        end_frame = int(scene_name[:-4].split('_')[2])

        # Chunk start offsets, every `chunk_size` frames.
        sampled_range = range(start_frame, end_frame , chunk_size)
        sampled_frames = list(sampled_range)

        # Cheap resume check on the FIRST chunk only: if it exists, assume the
        # whole clip was done. NOTE(review): this skips clips whose later
        # chunks are missing — confirm this is acceptable.
        sampled_chunk_end = sampled_frames[0] + 300
        start_str = f"{sampled_frames[0]:07d}"
        end_str = f"{sampled_chunk_end:07d}"

        chunk_name = f"{video_name}_{start_str}_{end_str}"
        save_chunk_path = os.path.join(output_dir,chunk_name,"encoded_video.pth")

        if os.path.exists(save_chunk_path):
            print(f"Video {video_name} already encoded, skipping...")
            continue

        video_path = scene_dir
        if not os.path.exists(video_path):
            print(f"Video not found: {video_path}")
            continue

        # Decode the whole clip once; chunks are sliced from this tensor.
        video_frames = encoder.load_video_frames(video_path)
        if video_frames is None:
            print(f"Failed to load video: {video_path}")
            continue

        # (1, C, T, H, W) on GPU in bfloat16 for the VAE.
        video_frames = video_frames.unsqueeze(0).to("cuda", dtype=torch.bfloat16)
        print('video shape:',video_frames.shape)

        print(f"Encoding scene {scene_name}...")
        for sampled_chunk_start in sampled_frames:
            # NOTE(review): 300 is hard-coded here while `chunk_size` is used
            # above — keep them in sync if chunk_size ever changes.
            sampled_chunk_end = sampled_chunk_start + 300
            start_str = f"{sampled_chunk_start:07d}"
            end_str = f"{sampled_chunk_end:07d}"

            chunk_name = f"{video_name}_{start_str}_{end_str}"
            save_chunk_dir = os.path.join(output_dir,chunk_name)

            os.makedirs(save_chunk_dir,exist_ok=True)
            print(f"Encoding chunk {chunk_name}...")

            encoded_path = os.path.join(save_chunk_dir, "encoded_video.pth")

            if os.path.exists(encoded_path):
                print(f"Chunk {chunk_name} already encoded, skipping...")
                continue

            # Slice frames/extrinsics for this chunk. NOTE(review): for the
            # final chunk the slice silently truncates to fewer than 300
            # frames when end_frame is not a multiple of chunk_size.
            chunk_frames = video_frames[:,:, sampled_chunk_start - start_frame : sampled_chunk_end - start_frame,...]
            chunk_cam_emb ={'extrinsic':cam_emb['extrinsic'][sampled_chunk_start - start_frame : sampled_chunk_end - start_frame],
                            'intrinsic':cam_emb['intrinsic']}

            with torch.no_grad():
                latents = encoder.pipe.encode_video(chunk_frames, **encoder.tiler_kwargs)[0]

            # Persist latents + per-chunk camera data (no prompt embedding).
            encoded_data = {
                "latents": latents.cpu(),
                "cam_emb": chunk_cam_emb
            }
            torch.save(encoded_data, encoded_path)
            print(f"Saved encoded data: {encoded_path}")
            processed_chunk_count += 1

        processed_count += 1

        print("Encoded scene numebr:",processed_count)
        print("Encoded chunk numebr:",processed_chunk_count)

    print(f"Encoding completed! Processed {processed_count} scenes.")
236
+
237
if __name__ == "__main__":
    # CLI entry point: all four options are plain string paths, so build the
    # parser from a flag -> default table instead of repeated add_argument calls.
    _defaults = {
        "--scenes_path": "/share_zhuyixuan05/public_datasets/sekai/Sekai-Project/sekai-game-walking",
        "--text_encoder_path": "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
        "--vae_path": "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
        "--output_dir": "/share_zhuyixuan05/zhuyixuan05/sekai-game-walking",
    }
    parser = argparse.ArgumentParser()
    for _flag, _default in _defaults.items():
        parser.add_argument(_flag, type=str, default=_default)
    args = parser.parse_args()
    encode_scenes(args.scenes_path, args.text_encoder_path, args.vae_path, args.output_dir)
scripts/encode_spatialvid.py ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import torch
4
+ import lightning as pl
5
+ from PIL import Image
6
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
7
+ import json
8
+ import imageio
9
+ from torchvision.transforms import v2
10
+ from einops import rearrange
11
+ import argparse
12
+ import numpy as np
13
+ import pdb
14
+ from tqdm import tqdm
15
+ import pandas as pd
16
+
17
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
18
+
19
+ from scipy.spatial.transform import Slerp
20
+ from scipy.spatial.transform import Rotation as R
21
+
22
def interpolate_camera_poses(original_frames, original_poses, target_frames):
    """Interpolate camera poses at the requested frame indices.

    Args:
        original_frames: frame indices with known poses, e.g. [0, 6, 12, ...].
        original_poses: (n, 7) array; each row is [tx, ty, tz, qx, qy, qz, qw].
        target_frames: frame indices to produce poses for, e.g. [0, 4, 8, ...].

    Returns:
        (m, 7) array: linear interpolation for translation, SLERP for rotation.
        Targets before the first / after the last known frame are clamped to
        the first / last pose respectively.
    """
    print('original_frames:',len(original_frames))
    print('original_poses:',len(original_poses))
    if len(original_frames) != len(original_poses):
        raise ValueError("原始帧数量与姿态数量不匹配")
    if original_poses.shape[1] != 7:
        raise ValueError(f"原始姿态应为(n,7)格式,实际为{original_poses.shape}")

    # Quaternion columns as a single Rotation sequence for SLERP.
    rotations = R.from_quat(original_poses[:, 3:7])

    result = []
    for frame in target_frames:
        pos = np.searchsorted(original_frames, frame, side='left')

        # Clamp targets outside the known range to the boundary poses.
        if pos == 0:
            result.append(original_poses[0])
        elif pos >= len(original_frames):
            result.append(original_poses[-1])
        else:
            lo = original_frames[pos - 1]
            hi = original_frames[pos]
            weight = (frame - lo) / (hi - lo)

            # Translation: straight linear blend between neighbours.
            prev_t = original_poses[pos - 1][:3]
            next_t = original_poses[pos][:3]
            blended_t = prev_t + weight * (next_t - prev_t)

            # Rotation: spherical linear interpolation between neighbours.
            blended_q = Slerp([lo, hi], rotations[pos - 1:pos + 1])(frame).as_quat()

            result.append(np.concatenate([blended_t, blended_q]))

    return np.array(result)
88
+
89
+
90
class VideoEncoder(pl.LightningModule):
    """Loads the Wan2.1 T5 text encoder + VAE and exposes video -> latent encoding.

    Only the encoder components are loaded (no diffusion weights); `self.pipe`
    is used downstream exclusively for `encode_video` / `encode_prompt`.
    """

    def __init__(self, text_encoder_path, vae_path, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
        """Build the pipeline on CPU in bfloat16; the caller moves it to GPU.

        Args:
            text_encoder_path: path to the UMT5 text-encoder checkpoint.
            vae_path: path to the Wan2.1 VAE checkpoint.
            tiled / tile_size / tile_stride: forwarded to the VAE so large
                frames are encoded tile-by-tile to bound GPU memory use.
        """
        super().__init__()
        model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
        model_manager.load_models([text_encoder_path, vae_path])
        self.pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager)
        self.tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}

        # Map uint8 PIL frames to float tensors normalized to [-1, 1].
        self.frame_process = v2.Compose([
            v2.ToTensor(),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])

    def crop_and_resize(self, image, target_width=832, target_height=480):
        """Resize a PIL image to the encoder resolution (default 832x480).

        The target size is parameterized (backward-compatible defaults) instead
        of hard-coded; the previously computed-but-unused source width/height
        locals were removed.
        """
        return v2.functional.resize(
            image,
            (round(target_height), round(target_width)),
            interpolation=v2.InterpolationMode.BILINEAR,
        )

    def load_video_frames(self, video_path):
        """Decode every frame of `video_path` into a (C, T, H, W) float tensor.

        Returns None when the video yields no frames. The reader is closed in
        a `finally` block so a decode error cannot leak the file handle.
        """
        reader = imageio.get_reader(video_path)
        frames = []
        try:
            for frame_data in reader:
                frame = Image.fromarray(frame_data)
                frame = self.crop_and_resize(frame)
                frame = self.frame_process(frame)
                frames.append(frame)
        finally:
            reader.close()

        if not frames:
            return None

        frames = torch.stack(frames, dim=0)
        return rearrange(frames, "T C H W -> C T H W")
135
+
136
def encode_scenes(scenes_path, text_encoder_path, vae_path,output_dir):
    """Encode SpatialVID-HQ videos into 300-frame latent chunks with captions.

    Layout assumed: <scenes_path>/<group>/<id>.mp4, with pose (poses.npy) and
    caption (caption.json) under the parallel "annotations" tree. Each chunk
    writes <output_dir>/<video>_<start7>_<end7>/encoded_video.pth containing
    {"latents", "prompt_emb", "cam_emb"}; existing files are repaired by
    filling in only the missing keys.
    """

    encoder = VideoEncoder(text_encoder_path, vae_path)
    encoder = encoder.cuda()
    encoder.pipe.device = "cuda"

    processed_count = 0        # number of source videos fully processed
    processed_chunk_count = 0  # number of chunk files written/repaired

    prompt_emb = 0  # placeholder; set per-video from the caption below

    # NOTE(review): metadata CSV path is hard-coded rather than derived from
    # `scenes_path` / CLI args — confirm this is intentional.
    metadata = pd.read_csv('/share_zhuyixuan05/public_datasets/SpatialVID-HQ/data/train/SpatialVID_HQ_metadata.csv')

    os.makedirs(output_dir,exist_ok=True)
    chunk_size = 300
    # A saved chunk is considered complete only if it has all three keys.
    required_keys = ["latents", "cam_emb", "prompt_emb"]

    for i, scene_name in enumerate(os.listdir(scenes_path)):
        # NOTE(review): the first three groups are unconditionally skipped —
        # looks like a leftover manual-sharding hack; confirm before reuse.
        if i < 3 :
            continue
        print('group:',i)
        scene_dir = os.path.join(scenes_path, scene_name)

        print('in:',scene_dir)
        for j, video_name in tqdm(enumerate(os.listdir(scene_dir)),total=len(os.listdir(scene_dir))):
            print(video_name)
            video_path = os.path.join(scene_dir, video_name)
            if not video_path.endswith(".mp4"):
                continue

            # Frame count comes from the dataset metadata row for this id.
            video_info = metadata[metadata['id'] == video_name[:-4]]
            num_frames = video_info['num frames'].iloc[0]

            # Annotations mirror the videos tree: .../annotations/<id>/poses.npy
            scene_cam_dir = video_path.replace( "videos","annotations")[:-4]
            scene_cam_path = os.path.join(scene_cam_dir,'poses.npy')

            scene_caption_path = os.path.join(scene_cam_dir,'caption.json')

            with open(scene_caption_path, 'r', encoding='utf-8') as f:
                caption_data = json.load(f)
            caption = caption_data["SceneSummary"]
            if not os.path.exists(scene_cam_path):
                print(f"Pose not found: {scene_cam_path}")
                continue

            camera_poses = np.load(scene_cam_path)
            cam_data_len = camera_poses.shape[0]

            if not os.path.exists(video_path):
                print(f"Video not found: {video_path}")
                continue

            # Fast path: if the first chunk already has latents, avoid decoding
            # the whole video (video_frames is set to the sentinel 1 and only
            # re-decoded later if some chunk turns out to need latents).
            start_str = f"{0:07d}"
            end_str = f"{chunk_size:07d}"
            # NOTE(review): this pre-check names the chunk with the FULL id
            # (video_name[:-4]) while the save path below uses only the prefix
            # before '_' — for ids containing '_', these disagree.
            chunk_name = f"{video_name[:-4]}_{start_str}_{end_str}"
            first_save_chunk_dir = os.path.join(output_dir,chunk_name)

            first_chunk_encoded_path = os.path.join(first_save_chunk_dir, "encoded_video.pth")
            if os.path.exists(first_chunk_encoded_path):
                data = torch.load(first_chunk_encoded_path,weights_only=False)
                if 'latents' in data:
                    video_frames = 1
                else:
                    video_frames = encoder.load_video_frames(video_path)
                    if video_frames is None:
                        print(f"Failed to load video: {video_path}")
                        continue
                    print('video shape:',video_frames.shape)

                    video_frames = video_frames.unsqueeze(0).to("cuda", dtype=torch.bfloat16)
                    print('video shape:',video_frames.shape)
            # NOTE(review): when the first-chunk file does not exist,
            # video_frames keeps its value from the previous loop iteration
            # (or is undefined on the very first video) — the later
            # isinstance() guard may then reuse a stale tensor; verify.

            video_name = video_name[:-4].split('_')[0]
            start_frame = 0
            end_frame = num_frames

            # Known pose timestamps are spread evenly over the whole video.
            cam_interval = end_frame // (cam_data_len - 1)

            cam_frames = np.linspace(start_frame, end_frame, cam_data_len, endpoint=True)
            cam_frames = np.round(cam_frames).astype(int)
            cam_frames = cam_frames.tolist()

            # Chunk start offsets, every `chunk_size` frames.
            sampled_range = range(start_frame, end_frame , chunk_size)
            sampled_frames = list(sampled_range)

            sampled_chunk_end = sampled_frames[0] + chunk_size
            start_str = f"{sampled_frames[0]:07d}"
            end_str = f"{sampled_chunk_end:07d}"

            chunk_name = f"{video_name}_{start_str}_{end_str}"

            print(f"Encoding scene {video_name}...")
            chunk_count_in_one_video = 0
            for sampled_chunk_start in sampled_frames:
                # Skip tail chunks shorter than 100 frames.
                if num_frames - sampled_chunk_start < 100:
                    continue
                sampled_chunk_end = sampled_chunk_start + chunk_size
                start_str = f"{sampled_chunk_start:07d}"
                end_str = f"{sampled_chunk_end:07d}"

                # Camera poses are resampled every 4 frames within the chunk.
                resample_cam_frame = list(range(sampled_chunk_start, sampled_chunk_end , 4))

                chunk_name = f"{video_name}_{start_str}_{end_str}"
                save_chunk_dir = os.path.join(output_dir,chunk_name)

                os.makedirs(save_chunk_dir,exist_ok=True)
                print(f"Encoding chunk {chunk_name}...")

                encoded_path = os.path.join(save_chunk_dir, "encoded_video.pth")

                # Repair mode: recompute only the keys missing from an existing file.
                missing_keys = required_keys
                if os.path.exists(encoded_path):
                    print('error:',encoded_path)
                    data = torch.load(encoded_path,weights_only=False)
                    missing_keys = [key for key in required_keys if key not in data]
                    if missing_keys:
                        print(f"警告: 文件中缺少以下必要元素: {missing_keys}")
                    if len(missing_keys) == 0 :
                        continue
                else:
                    print(f"警告: 缺少pth文件: {encoded_path}")
                # Decode the full video lazily, only when latents must be built.
                if not isinstance(video_frames, torch.Tensor):
                    video_frames = encoder.load_video_frames(video_path)
                    if video_frames is None:
                        print(f"Failed to load video: {video_path}")
                        continue

                    video_frames = video_frames.unsqueeze(0).to("cuda", dtype=torch.bfloat16)

                print('video shape:',video_frames.shape)
                if "latents" in missing_keys:
                    chunk_frames = video_frames[:,:, sampled_chunk_start - start_frame : sampled_chunk_end - start_frame,...]

                    with torch.no_grad():
                        latents = encoder.pipe.encode_video(chunk_frames, **encoder.tiler_kwargs)[0]
                else:
                    latents = data['latents']
                if "cam_emb" in missing_keys:
                    # Interpolate sparse annotated poses onto the resampled grid.
                    cam_emb = interpolate_camera_poses(cam_frames, camera_poses,resample_cam_frame)
                    chunk_cam_emb ={'extrinsic':cam_emb}
                    print(f"视频长度:{chunk_size},重采样相机长度:{cam_emb.shape[0]}")
                else:
                    chunk_cam_emb = data['cam_emb']

                if "prompt_emb" in missing_keys:
                    # The caption is shared by all chunks of a video; encode once.
                    if chunk_count_in_one_video == 0:
                        print(caption)
                        with torch.no_grad():
                            prompt_emb = encoder.pipe.encode_prompt(caption)
                else:
                    prompt_emb = data['prompt_emb']

                # Persist latents + prompt embedding + per-chunk camera data.
                encoded_data = {
                    "latents": latents.cpu(),
                    "prompt_emb": {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in prompt_emb.items()},
                    "cam_emb": chunk_cam_emb
                }
                torch.save(encoded_data, encoded_path)
                print(f"Saved encoded data: {encoded_path}")
                processed_chunk_count += 1
                chunk_count_in_one_video += 1

            processed_count += 1

            print("Encoded scene numebr:",processed_count)
            print("Encoded chunk numebr:",processed_chunk_count)

    print(f"Encoding completed! Processed {processed_count} scenes.")
396
+
397
if __name__ == "__main__":
    # CLI entry point: all four options are plain string paths, so build the
    # parser from a flag -> default table instead of repeated add_argument calls.
    _defaults = {
        "--scenes_path": "/share_zhuyixuan05/public_datasets/SpatialVID-HQ/SpatialVid/HQ/videos/",
        "--text_encoder_path": "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
        "--vae_path": "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
        "--output_dir": "/share_zhuyixuan05/zhuyixuan05/spatialvid",
    }
    parser = argparse.ArgumentParser()
    for _flag, _default in _defaults.items():
        parser.add_argument(_flag, type=str, default=_default)
    args = parser.parse_args()
    encode_scenes(args.scenes_path, args.text_encoder_path, args.vae_path, args.output_dir)
scripts/encode_spatialvid_first_frame.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import torch
4
+ import lightning as pl
5
+ from PIL import Image
6
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
7
+ import json
8
+ import imageio
9
+ from torchvision.transforms import v2
10
+ from einops import rearrange
11
+ import argparse
12
+ import numpy as np
13
+ import pdb
14
+ from tqdm import tqdm
15
+ import pandas as pd
16
+
17
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
18
+
19
+ from scipy.spatial.transform import Slerp
20
+ from scipy.spatial.transform import Rotation as R
21
+
22
def interpolate_camera_poses(original_frames, original_poses, target_frames):
    """Interpolate camera poses at the requested target frame indices.

    Translation is linearly interpolated and rotation is SLERPed between the
    two original keyframes bracketing each target frame. Targets before the
    first / after the last keyframe are clamped to the first / last pose.

    Args:
        original_frames: sorted original frame indices, e.g. [0, 6, 12, ...].
        original_poses: (n, 7) array; each row is [tx, ty, tz, qx, qy, qz, qw].
        target_frames: target frame indices, e.g. [0, 4, 8, 12, ...].

    Returns:
        (m, 7) array of interpolated poses, one row per target frame.
    """
    print('original_frames:',len(original_frames))
    print('original_poses:',len(original_poses))
    if len(original_frames) != len(original_poses):
        raise ValueError("原始帧数量与姿态数量不匹配")

    if original_poses.shape[1] != 7:
        raise ValueError(f"原始姿态应为(n,7)格式,实际为{original_poses.shape}")

    # Precompute Rotation objects for all keyframe quaternions.
    key_rotations = R.from_quat(original_poses[:, 3:7])

    def pose_at(t):
        """Interpolated pose for a single target frame t (clamped at the ends)."""
        idx = np.searchsorted(original_frames, t, side='left')

        if idx == 0:
            return original_poses[0]
        if idx >= len(original_frames):
            return original_poses[-1]

        t_prev, t_next = original_frames[idx - 1], original_frames[idx]
        alpha = (t - t_prev) / (t_next - t_prev)

        # Linear interpolation of the translation component.
        trans_prev = original_poses[idx - 1][:3]
        trans_next = original_poses[idx][:3]
        trans = trans_prev + alpha * (trans_next - trans_prev)

        # Spherical linear interpolation (SLERP) of the rotation component.
        rot = Slerp([t_prev, t_next], key_rotations[idx - 1:idx + 1])(t)

        return np.concatenate([trans, rot.as_quat()])

    return np.array([pose_at(t) for t in target_frames])
88
+
89
class VideoEncoder(pl.LightningModule):
    """Offline encoder wrapping the ReCamMaster pipeline's text encoder + VAE.

    Used only to pre-compute latents; no diffusion model is loaded. The caller
    is expected to move the module to GPU and set ``pipe.device`` afterwards
    (see ``encode_scenes``).
    """

    def __init__(self, text_encoder_path, vae_path, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
        super().__init__()
        # Models are loaded on CPU first in bfloat16; GPU placement is done by the caller.
        model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
        model_manager.load_models([text_encoder_path, vae_path])
        self.pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager)
        # Tiled VAE encoding bounds peak memory for large frames.
        self.tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}

        # PIL image -> float tensor normalized to [-1, 1].
        self.frame_process = v2.Compose([
            v2.ToTensor(),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])

    def crop_and_resize(self, image):
        # Despite the name, this only resizes (no crop) to a fixed 832x480
        # (width x height), which may change the aspect ratio.
        width, height = image.size  # NOTE(review): original size is read but never used — confirm intended
        width_ori, height_ori_ = 832 , 480
        image = v2.functional.resize(
            image,
            (round(height_ori_), round(width_ori)),
            interpolation=v2.InterpolationMode.BILINEAR
        )
        return image

    def load_single_frame(self, video_path, frame_idx):
        """Load only the requested single frame.

        Returns a [1, C, 1, H, W] tensor, or None if the frame cannot be read.
        """
        reader = imageio.get_reader(video_path)

        try:
            # Seek directly to the requested frame.
            frame_data = reader.get_data(frame_idx)
            frame = Image.fromarray(frame_data)
            frame = self.crop_and_resize(frame)
            frame = self.frame_process(frame)

            # Add batch and time dimensions: [C, H, W] -> [1, C, 1, H, W]
            frame = frame.unsqueeze(0).unsqueeze(2)

        except Exception as e:
            # Best-effort: report the failure and signal it with None.
            print(f"Error loading frame {frame_idx} from {video_path}: {e}")
            return None
        finally:
            reader.close()

        return frame

    def load_video_frames(self, video_path):
        """Load the full video (kept for compatibility).

        Returns a [C, T, H, W] tensor, or None for an empty/unreadable video.
        """
        reader = imageio.get_reader(video_path)
        frames = []

        for frame_data in reader:
            frame = Image.fromarray(frame_data)
            frame = self.crop_and_resize(frame)
            frame = self.frame_process(frame)
            frames.append(frame)

        reader.close()

        if len(frames) == 0:
            return None

        frames = torch.stack(frames, dim=0)
        frames = rearrange(frames, "T C H W -> C T H W")
        return frames
153
+
154
def encode_scenes(scenes_path, text_encoder_path, vae_path,output_dir):
    """Encode the first frame of each 300-frame chunk of every scene video.

    For each .mp4 under ``scenes_path/<group>/``, the video is split into
    fixed-size chunks; the chunk's first frame is loaded, repeated 4 times
    along the time axis, VAE-encoded, and saved as
    ``<output_dir>/<video>_<start>_<end>/first_latent.pth``.
    Chunks whose latent file already exists are skipped (resumable).
    """

    encoder = VideoEncoder(text_encoder_path, vae_path)
    encoder = encoder.cuda()
    encoder.pipe.device = "cuda"

    processed_count = 0
    processed_chunk_count = 0

    # NOTE(review): metadata CSV path is hard-coded; 'id' is assumed to match
    # the video filename stem and 'num frames' its frame count — confirm.
    metadata = pd.read_csv('/share_zhuyixuan05/public_datasets/SpatialVID-HQ/data/train/SpatialVID_HQ_metadata.csv')

    os.makedirs(output_dir,exist_ok=True)
    chunk_size = 300

    for i, scene_name in enumerate(os.listdir(scenes_path)):
        # NOTE(review): the first two scene groups are skipped unconditionally —
        # presumably already processed elsewhere; confirm before reuse.
        if i < 2:
            continue
        print('group:',i)
        scene_dir = os.path.join(scenes_path, scene_name)

        print('in:',scene_dir)
        for j, video_name in tqdm(enumerate(os.listdir(scene_dir)),total=len(os.listdir(scene_dir))):
            print(video_name)
            video_path = os.path.join(scene_dir, video_name)
            if not video_path.endswith(".mp4"):
                continue

            # Look up per-video metadata by filename stem.
            video_info = metadata[metadata['id'] == video_name[:-4]]
            num_frames = video_info['num frames'].iloc[0]

            # Annotations live in a parallel tree: .../videos/... -> .../annotations/...
            scene_cam_dir = video_path.replace("videos","annotations")[:-4]
            scene_cam_path = os.path.join(scene_cam_dir,'poses.npy')
            scene_caption_path = os.path.join(scene_cam_dir,'caption.json')

            # NOTE(review): caption.json is opened before any existence check and
            # `caption` is never used afterwards — this will raise if missing.
            with open(scene_caption_path, 'r', encoding='utf-8') as f:
                caption_data = json.load(f)
            caption = caption_data["SceneSummary"]

            if not os.path.exists(scene_cam_path):
                print(f"Pose not found: {scene_cam_path}")
                continue

            camera_poses = np.load(scene_cam_path)
            cam_data_len = camera_poses.shape[0]

            if not os.path.exists(video_path):
                print(f"Video not found: {video_path}")
                continue

            # Keep only the id prefix for chunk naming.
            video_name = video_name[:-4].split('_')[0]
            start_frame = 0
            end_frame = num_frames

            # NOTE(review): unused below; also divides by zero if cam_data_len == 1.
            cam_interval = end_frame // (cam_data_len - 1)

            # Frame indices that camera poses correspond to (currently unused).
            cam_frames = np.linspace(start_frame, end_frame, cam_data_len, endpoint=True)
            cam_frames = np.round(cam_frames).astype(int)
            cam_frames = cam_frames.tolist()

            # Chunk start offsets: 0, 300, 600, ...
            sampled_range = range(start_frame, end_frame, chunk_size)
            sampled_frames = list(sampled_range)

            print(f"Encoding scene {video_name}...")
            chunk_count_in_one_video = 0

            for sampled_chunk_start in sampled_frames:
                # Skip trailing chunks shorter than 100 frames.
                if num_frames - sampled_chunk_start < 100:
                    continue

                sampled_chunk_end = sampled_chunk_start + chunk_size
                start_str = f"{sampled_chunk_start:07d}"
                end_str = f"{sampled_chunk_end:07d}"

                chunk_name = f"{video_name}_{start_str}_{end_str}"
                save_chunk_dir = os.path.join(output_dir, chunk_name)
                os.makedirs(save_chunk_dir, exist_ok=True)

                print(f"Encoding chunk {chunk_name}...")

                first_latent_path = os.path.join(save_chunk_dir, "first_latent.pth")

                # Resumability: skip chunks that were already encoded.
                if os.path.exists(first_latent_path):
                    print(f"First latent for chunk {chunk_name} already exists, skipping...")
                    continue

                # Load only the single frame that is needed.
                first_frame_idx = sampled_chunk_start
                print(f"first_frame:{first_frame_idx}")
                first_frame = encoder.load_single_frame(video_path, first_frame_idx)

                if first_frame is None:
                    print(f"Failed to load frame {first_frame_idx} from: {video_path}")
                    continue

                first_frame = first_frame.to("cuda", dtype=torch.bfloat16)

                # Repeat the frame 4x along the time axis (VAE temporal window).
                repeated_first_frame = first_frame.repeat(1, 1, 4, 1, 1)
                print(f"Repeated first frame shape: {repeated_first_frame.shape}")

                with torch.no_grad():
                    first_latents = encoder.pipe.encode_video(repeated_first_frame, **encoder.tiler_kwargs)[0]

                first_latent_data = {
                    "latents": first_latents.cpu(),
                }
                torch.save(first_latent_data, first_latent_path)
                print(f"Saved first latent: {first_latent_path}")

                processed_chunk_count += 1
                chunk_count_in_one_video += 1

            processed_count += 1
            print("Encoded scene number:", processed_count)
            print("Encoded chunk number:", processed_chunk_count)

    print(f"Encoding completed! Processed {processed_count} scenes.")
272
+
273
if __name__ == "__main__":
    # CLI entry point: collect paths, then encode all scenes.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--scenes_path", type=str,
        default="/share_zhuyixuan05/public_datasets/SpatialVID-HQ/SpatialVid/HQ/videos/")
    arg_parser.add_argument(
        "--text_encoder_path", type=str,
        default="models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth")
    arg_parser.add_argument(
        "--vae_path", type=str,
        default="models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth")
    arg_parser.add_argument(
        "--output_dir", type=str,
        default="/share_zhuyixuan05/zhuyixuan05/spatialvid")
    opts = arg_parser.parse_args()
    encode_scenes(opts.scenes_path, opts.text_encoder_path,
                  opts.vae_path, opts.output_dir)
scripts/hud_logo.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from PIL import Image, ImageDraw, ImageFont
import os

os.makedirs("wasd_ui", exist_ok=True)

# UI sizes (small)
key_size = (48, 48)
corner = 10
bg_padding = 6

# FIX: fall back to PIL's built-in bitmap font when "arial.ttf" is not
# installed (common on Linux/CI) instead of crashing at import time.
try:
    font = ImageFont.truetype("arial.ttf", 28)
except OSError:
    font = ImageFont.load_default()

def rounded_rect(im, bbox, radius, fill):
    """Draw a filled rounded rectangle (RGBA fill supported) onto *im*."""
    draw = ImageDraw.Draw(im, "RGBA")
    draw.rounded_rectangle(bbox, radius=radius, fill=fill)

# Background plate sized for a 3x2 key grid plus padding.
bg_width = key_size[0] * 3 + bg_padding * 4
bg_height = key_size[1] * 2 + bg_padding * 4
ui_bg = Image.new("RGBA", (bg_width, bg_height), (0,0,0,0))
rounded_rect(ui_bg, (0,0,bg_width,bg_height), corner, (0,0,0,140))
ui_bg.save("wasd_ui/ui_background.png")

keys = ["W","A","S","D"]

def draw_key(char, active):
    """Render one key cap; active keys are brighter with darker glyph color."""
    im = Image.new("RGBA", key_size, (0,0,0,0))
    rounded_rect(im, (0,0,key_size[0],key_size[1]), corner,
                 (255,255,255,230) if active else (200,200,200,180))
    draw = ImageDraw.Draw(im)
    color = (0,0,0) if active else (50,50,50)
    # FIX: ImageDraw.textsize() was deprecated in Pillow 9.2 and removed in
    # Pillow 10 — measure the glyph with textbbox() instead.
    left, top, right, bottom = draw.textbbox((0, 0), char, font=font)
    w, h = right - left, bottom - top
    # Offset by (-left, -top) so the glyph's ink box (not its layout box,
    # which includes bearing/ascender space) is centered on the key.
    draw.text(((key_size[0]-w)//2 - left, (key_size[1]-h)//2 - top),
              char, font=font, fill=color)
    return im

for k in keys:
    draw_key(k, False).save(f"wasd_ui/key_{k}_idle.png")
    draw_key(k, True).save(f"wasd_ui/key_{k}_active.png")

print("✅ WASD UI assets generated in ./wasd_ui/")
scripts/infer_demo.py ADDED
@@ -0,0 +1,1458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
5
+ sys.path.append(ROOT_DIR)
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import numpy as np
10
+ from PIL import Image
11
+ import imageio
12
+ import json
13
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
14
+ import argparse
15
+ from torchvision.transforms import v2
16
+ from einops import rearrange
17
+ import random
18
+ import copy
19
+ from datetime import datetime
20
+
21
def compute_relative_pose_matrix(pose1, pose2):
    """Compute the relative pose between two adjacent frames as a 3x4 camera matrix [R_rel | t_rel].

    Args:
        pose1: pose of frame i, shape (7,): [tx1, ty1, tz1, qx1, qy1, qz1, qw1]
        pose2: pose of frame i+1, shape (7,): [tx2, ty2, tz2, qx2, qy2, qz2, qw2]

    Returns:
        relative_matrix: (3, 4) array whose first 3 columns are the relative
        rotation R_rel and whose last column is the relative translation t_rel
        expressed in frame i's coordinates.
    """
    # FIX: `R` (scipy Rotation) is not imported at this script's module level,
    # so the original raised NameError at call time. Import it locally; this
    # is also harmless if a module-level alias exists.
    from scipy.spatial.transform import Rotation as R

    # Split translation and quaternion parts.
    t1, q1 = pose1[:3], pose1[3:]
    t2, q2 = pose2[:3], pose2[3:]

    # 1. Relative rotation: R_rel = R2 * R1^-1 (later frame times inverse of earlier).
    rot1 = R.from_quat(q1)
    rot2 = R.from_quat(q2)
    R_rel = (rot2 * rot1.inv()).as_matrix()

    # 2. Relative translation: t_rel = R1^T (t2 - t1) (R1^T == R1^-1 for rotations).
    t_rel = rot1.as_matrix().T @ (t2 - t1)

    # 3. Assemble the 3x4 matrix [R_rel | t_rel].
    relative_matrix = np.hstack([R_rel, t_rel.reshape(3, 1)])

    return relative_matrix
52
+
53
def load_encoded_video_from_pth(pth_path, start_frame=0, num_frames=10):
    """Load pre-encoded video latents from a .pth file and slice out a window.

    Args:
        pth_path: path to a checkpoint containing a 'latents' tensor [C, T, H, W].
        start_frame: first latent frame of the window.
        num_frames: number of latent frames to extract.

    Returns:
        (condition_latents, encoded_data): the [C, num_frames, H, W] slice and
        the full dict loaded from disk.

    Raises:
        ValueError: if the requested window extends past the stored sequence.
    """
    print(f"Loading encoded video from {pth_path}")

    encoded_data = torch.load(pth_path, weights_only=False, map_location="cpu")
    full_latents = encoded_data['latents']  # [C, T, H, W]

    end_frame = start_frame + num_frames
    available = full_latents.shape[1]

    print(f"Full latents shape: {full_latents.shape}")
    print(f"Extracting frames {start_frame} to {end_frame}")

    if end_frame > available:
        raise ValueError(f"Not enough frames: requested {end_frame}, available {available}")

    condition_latents = full_latents[:, start_frame:end_frame, :, :]
    print(f"Extracted condition latents shape: {condition_latents.shape}")

    return condition_latents, encoded_data
70
+
71
+
72
def compute_relative_pose(pose_a, pose_b, use_torch=False):
    """Return camera B's pose relative to camera A, i.e. pose_b @ inverse(pose_a).

    Both inputs are 4x4 homogeneous extrinsic matrices (numpy arrays or, when
    use_torch=True, optionally torch tensors). The result matches the input
    backend: a torch tensor when use_torch=True, otherwise a numpy array.
    """
    assert pose_a.shape == (4, 4), f"相机A外参矩阵形状应为(4,4),实际为{pose_a.shape}"
    assert pose_b.shape == (4, 4), f"相机B外参矩阵形状应为(4,4),实际为{pose_b.shape}"

    if use_torch:
        # Promote numpy inputs to float tensors before inverting.
        if not isinstance(pose_a, torch.Tensor):
            pose_a = torch.from_numpy(pose_a).float()
        if not isinstance(pose_b, torch.Tensor):
            pose_b = torch.from_numpy(pose_b).float()
        return torch.matmul(pose_b, torch.inverse(pose_a))

    # Numpy path: coerce loose inputs (lists etc.) to float32 arrays.
    if not isinstance(pose_a, np.ndarray):
        pose_a = np.array(pose_a, dtype=np.float32)
    if not isinstance(pose_b, np.ndarray):
        pose_b = np.array(pose_b, dtype=np.float32)
    return np.matmul(pose_b, np.linalg.inv(pose_a))
95
+
96
+
97
def replace_dit_model_in_manager():
    """Patch diffsynth's loader configs so 'wan_video_dit' resolves to the MoE model class.

    Mutates `model_loader_configs` in place: every entry that lists
    'wan_video_dit' gets its class swapped for WanModelMoe; all other
    names/classes are left untouched.
    """
    from diffsynth.models.wan_video_dit_moe import WanModelMoe
    from diffsynth.configs.model_config import model_loader_configs

    for idx, entry in enumerate(model_loader_configs):
        keys_hash, keys_hash_with_shape, names, classes, resource = entry

        if 'wan_video_dit' not in names:
            continue

        patched_names = []
        patched_classes = []
        for model_name, model_cls in zip(names, classes):
            patched_names.append(model_name)
            if model_name == 'wan_video_dit':
                patched_classes.append(WanModelMoe)
                print(f"✅ 替换了模型类: {model_name} -> WanModelMoe")
            else:
                patched_classes.append(model_cls)

        model_loader_configs[idx] = (keys_hash, keys_hash_with_shape, patched_names, patched_classes, resource)
120
+
121
def add_framepack_components(dit_model):
    """Attach FramePack's multi-scale clean-latent embedder to *dit_model* (no-op if present)."""
    if hasattr(dit_model, 'clean_x_embedder'):
        return

    # Transformer width, inferred from the first block's attention projection.
    inner_dim = dit_model.blocks[0].self_attn.q.weight.shape[0]

    class CleanXEmbedder(nn.Module):
        """Projects 16-channel latents to the transformer width at 1x/2x/4x downsampling."""

        def __init__(self, inner_dim):
            super().__init__()
            self.proj = nn.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
            self.proj_2x = nn.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
            self.proj_4x = nn.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))

        def forward(self, x, scale="1x"):
            layers = {"1x": self.proj, "2x": self.proj_2x, "4x": self.proj_4x}
            if scale not in layers:
                raise ValueError(f"Unsupported scale: {scale}")
            conv = layers[scale]
            # Cast the input to the convolution's dtype before projecting.
            return conv(x.to(conv.weight.dtype))

    dit_model.clean_x_embedder = CleanXEmbedder(inner_dim)
    # Align the new embedder's dtype with the rest of the model's parameters.
    model_dtype = next(dit_model.parameters()).dtype
    dit_model.clean_x_embedder = dit_model.clean_x_embedder.to(dtype=model_dtype)
    print("✅ 添加了FramePack的clean_x_embedder组件")
150
+
151
+
152
def add_moe_components(dit_model, moe_config):
    """🔧 Add MoE components - corrected version.

    Installs per-modality input processors, a global expert router, and one
    MultiModalMoE network per transformer block on an already-loaded model.
    """
    if not hasattr(dit_model, 'moe_config'):
        dit_model.moe_config = moe_config
        print("✅ 添加了MoE配置到模型")
    dit_model.top_k = moe_config.get("top_k", 1)

    # Dynamically attach MoE components to each block.
    # Transformer width, inferred from the first block's attention projection.
    dim = dit_model.blocks[0].self_attn.q.weight.shape[0]
    unified_dim = moe_config.get("unified_dim", 25)
    num_experts = moe_config.get("num_experts", 4)
    from diffsynth.models.wan_video_dit_moe import ModalityProcessor, MultiModalMoE
    # One processor per dataset modality, mapping raw camera/action inputs of
    # varying width into the shared `unified_dim` space.
    dit_model.sekai_processor = ModalityProcessor("sekai", 13, unified_dim)
    dit_model.nuscenes_processor = ModalityProcessor("nuscenes", 8, unified_dim)
    dit_model.openx_processor = ModalityProcessor("openx", 13, unified_dim)  # OpenX uses 13-dim input, like sekai but processed independently
    dit_model.global_router = nn.Linear(unified_dim, num_experts)

    for i, block in enumerate(dit_model.blocks):
        # MoE network: consumes unified_dim, emits dim.
        # NOTE(review): model-level top_k defaults to 1 above, but each block's
        # MoE defaults to top_k=2 here — confirm this asymmetry is intended.
        block.moe = MultiModalMoE(
            unified_dim=unified_dim,
            output_dim=dim,  # output width matches the transformer block's dim
            num_experts=moe_config.get("num_experts", 4),
            top_k=moe_config.get("top_k", 2)
        )

        print(f"✅ Block {i} 添加了MoE组件 (unified_dim: {unified_dim}, experts: {moe_config.get('num_experts', 4)})")
180
+
181
+
182
def _sekai_synthetic_pose(yaw_per_frame, forward_speed, lateral_shift=0.0):
    """Build one synthetic 4x4 relative pose: yaw about the Y axis plus translation.

    Negative Z translation encodes forward motion; `lateral_shift` goes into
    the X translation (used only by the s_curve profile).
    """
    cos_yaw = np.cos(yaw_per_frame)
    sin_yaw = np.sin(yaw_per_frame)
    pose = np.eye(4, dtype=np.float32)
    pose[0, 0] = cos_yaw
    pose[0, 2] = sin_yaw
    pose[2, 0] = -sin_yaw
    pose[2, 2] = cos_yaw
    pose[2, 3] = -forward_speed
    pose[0, 3] = lateral_shift
    return pose


def generate_sekai_camera_embeddings_sliding(
        cam_data,
        start_frame,
        initial_condition_frames,
        new_frames,
        total_generated,
        use_real_poses=True,
        direction="left"):
    """Generate camera embeddings for the Sekai dataset - sliding window version.

    Refactored: the six synthetic motion branches were ~250 lines of duplicated
    code differing only in (yaw, forward, lateral-shift) constants; they are now
    a profile table driving a shared pose builder. Behavior is unchanged,
    including the synthetic branch's condition mask extending one frame past
    `initial_condition_frames`.

    Args:
        cam_data: dict whose 'extrinsic' key holds an (N, 4, 4) array of camera
            extrinsics, or None to synthesize motion.
        start_frame: start index of the current generation window.
        initial_condition_frames: number of initial condition frames.
        new_frames: number of new frames generated this step.
        total_generated: total frames generated so far (kept for interface
            compatibility; unused here).
        use_real_poses: whether to use real Sekai camera poses.
        direction: synthetic camera motion profile, default "left".

    Returns:
        (M, 13) bfloat16 tensor: the flattened 3x4 relative pose per frame plus
        a trailing condition-mask column.

    Raises:
        ValueError: for an unknown synthetic `direction`.
    """
    time_compression_ratio = 4

    # Frames FramePack actually consumes:
    # 1 initial + 16 at 4x + 2 at 2x + 1 at 1x + new_frames
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames

    if use_real_poses and cam_data is not None and 'extrinsic' in cam_data:
        print("🔧 使用真实Sekai camera数据")
        cam_extrinsic = cam_data['extrinsic']

        # Ensure the camera sequence is long enough for every consumer.
        max_needed_frames = max(
            start_frame + initial_condition_frames + new_frames,
            framepack_needed_frames,
            30
        )

        print(f"🔧 计算Sekai camera序列长度:")
        print(f" - 基础需求: {start_frame + initial_condition_frames + new_frames}")
        print(f" - FramePack需求: {framepack_needed_frames}")
        print(f" - 最终生成: {max_needed_frames}")

        relative_poses = []
        for i in range(max_needed_frames):
            # Position of this latent frame in the original (uncompressed) sequence.
            frame_idx = i * time_compression_ratio
            next_frame_idx = frame_idx + time_compression_ratio

            if next_frame_idx < len(cam_extrinsic):
                relative_pose = compute_relative_pose(cam_extrinsic[frame_idx], cam_extrinsic[next_frame_idx])
                relative_poses.append(torch.as_tensor(relative_pose[:3, :]))
            else:
                # Past the recorded trajectory: fall back to zero motion.
                print(f"⚠️ 帧{frame_idx}超出camera数据范围,使用零运动")
                relative_poses.append(torch.zeros(3, 4))

        # (M, 3, 4) -> (M, 12); equivalent to einops 'b c d -> b (c d)'.
        pose_embedding = torch.stack(relative_poses, dim=0).flatten(start_dim=1)

        # Condition mask over [start_frame, start_frame + initial_condition_frames).
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        condition_end = min(start_frame + initial_condition_frames, max_needed_frames)
        mask[start_frame:condition_end] = 1.0

        camera_embedding = torch.cat([pose_embedding, mask], dim=1)
        print(f"🔧 Sekai真实camera embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)

    # ---------- synthetic motion ----------
    max_needed_frames = max(
        start_frame + initial_condition_frames + new_frames,
        framepack_needed_frames,
        30)

    print(f"🔧 生成Sekai合成camera帧数: {max_needed_frames}")

    CONDITION_FRAMES = initial_condition_frames
    STAGE_1 = new_frames // 2
    STAGE_2 = new_frames - STAGE_1

    # Per-direction (yaw, forward_speed) for stage 1 and stage 2.
    # Positive yaw turns left; forward_speed > 0 moves the camera forward.
    profiles = {
        "left": ((0.03, 0.00), (0.03, 0.00)),
        "right": ((-0.03, 0.00), (-0.03, 0.00)),
        "forward_left": ((0.03, 0.03), (0.03, 0.03)),
        "forward_right": ((-0.03, 0.03), (-0.03, 0.03)),
        "s_curve": ((0.03, 0.03), (-0.03, 0.03)),
        "left_right": ((0.03, 0.00), (-0.03, 0.00)),
    }
    banners = {
        "left": "--------------- LEFT TURNING MODE ---------------",
        "right": "--------------- RIGHT TURNING MODE ---------------",
        "forward_left": "--------------- FORWARD LEFT MODE ---------------",
        "forward_right": "--------------- FORWARD RIGHT MODE ---------------",
        "s_curve": "--------------- S CURVE MODE ---------------",
        "left_right": "--------------- LEFT RIGHT MODE ---------------",
    }
    if direction not in profiles:
        raise ValueError(f"未定义的相机运动方向: {direction}")

    print(banners[direction])
    stage1_motion, stage2_motion = profiles[direction]
    motion_end = CONDITION_FRAMES + STAGE_1 + STAGE_2

    relative_poses = []
    for i in range(max_needed_frames):
        if i < CONDITION_FRAMES or i >= motion_end:
            # Condition frames and frames past the target window: zero motion.
            pose = np.eye(4, dtype=np.float32)
        elif i < CONDITION_FRAMES + STAGE_1:
            yaw, forward = stage1_motion
            pose = _sekai_synthetic_pose(yaw, forward)
        else:
            yaw, forward = stage2_motion
            # s_curve keeps a slight leftward drift (inertia) early in stage 2.
            if direction == "s_curve" and i < CONDITION_FRAMES + STAGE_1 + STAGE_2 // 3:
                lateral_shift = -0.01
            else:
                lateral_shift = 0.00
            pose = _sekai_synthetic_pose(yaw, forward, lateral_shift)

        relative_poses.append(torch.as_tensor(pose[:3, :]))

    # (M, 3, 4) -> (M, 12); equivalent to einops 'b c d -> b (c d)'.
    pose_embedding = torch.stack(relative_poses, dim=0).flatten(start_dim=1)

    # NOTE: unlike the real-pose branch, the original synthetic branch marks
    # initial_condition_frames + 1 frames as condition — preserved here.
    mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
    condition_end = min(start_frame + initial_condition_frames + 1, max_needed_frames)
    mask[start_frame:condition_end] = 1.0

    camera_embedding = torch.cat([pose_embedding, mask], dim=1)
    print(f"🔧 Sekai合成camera embedding shape: {camera_embedding.shape}")
    return camera_embedding.to(torch.bfloat16)
523
+
524
+
525
def generate_openx_camera_embeddings_sliding(
    encoded_data, start_frame, initial_condition_frames, new_frames, use_real_poses):
    """Generate camera embeddings for the OpenX dataset (sliding-window variant).

    Returns a ``[max_needed_frames, 13]`` bfloat16 tensor: a flattened 3x4
    relative camera pose (12 values) per latent frame, plus one mask value
    (1.0 = condition frame, 0.0 = target frame).
    """
    time_compression_ratio = 4

    # Camera frames the FramePack index layout consumes:
    # 1 start + 16 (4x) + 2 (2x) + 1 (1x) + the newly generated frames.
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames

    have_real_poses = (
        use_real_poses
        and encoded_data is not None
        and 'cam_emb' in encoded_data
        and 'extrinsic' in encoded_data['cam_emb']
    )

    if have_real_poses:
        print("🔧 使用OpenX真实camera数据")
        extrinsics = encoded_data['cam_emb']['extrinsic']

        # Make the camera sequence long enough for every consumer.
        max_needed_frames = max(
            start_frame + initial_condition_frames + new_frames,
            framepack_needed_frames,
            30,
        )

        print(f"🔧 计算OpenX camera序列长度:")
        print(f" - 基础需求: {start_frame + initial_condition_frames + new_frames}")
        print(f" - FramePack需求: {framepack_needed_frames}")
        print(f" - 最终生成: {max_needed_frames}")

        relative_poses = []
        for step in range(max_needed_frames):
            # OpenX samples extrinsics at the 4x temporal compression stride.
            cur_idx = step * time_compression_ratio
            nxt_idx = cur_idx + time_compression_ratio

            if nxt_idx < len(extrinsics):
                rel = compute_relative_pose(extrinsics[cur_idx], extrinsics[nxt_idx])
                relative_poses.append(torch.as_tensor(rel[:3, :]))
            else:
                # Past the available data: fall back to zero motion.
                print(f"⚠️ 帧{cur_idx}超出OpenX camera数据范围,使用零运动")
                relative_poses.append(torch.zeros(3, 4))

        # Flatten each 3x4 pose to a 12-vector.
        pose_embedding = torch.stack(relative_poses, dim=0).reshape(max_needed_frames, -1)

        # Condition mask over the same number of frames.
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        mask[start_frame:min(start_frame + initial_condition_frames, max_needed_frames)] = 1.0

        camera_embedding = torch.cat([pose_embedding, mask], dim=1)
        print(f"🔧 OpenX真实camera embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)

    print("🔧 使用OpenX合成camera数据")

    max_needed_frames = max(
        start_frame + initial_condition_frames + new_frames,
        framepack_needed_frames,
        30,
    )
    print(f"🔧 生成OpenX合成camera帧数: {max_needed_frames}")

    # Synthetic robot-manipulation motion: small fixed per-frame rotations
    # and a slow translation. The pose is constant across frames, so it is
    # built once and repeated.
    roll_per_frame = 0.02    # slight roll
    pitch_per_frame = 0.01   # slight pitch
    yaw_per_frame = 0.015    # slight yaw
    forward_speed = 0.003    # slow forward motion

    cos_roll, sin_roll = np.cos(roll_per_frame), np.sin(roll_per_frame)
    cos_pitch, sin_pitch = np.cos(pitch_per_frame), np.sin(pitch_per_frame)
    cos_yaw, sin_yaw = np.cos(yaw_per_frame), np.sin(yaw_per_frame)

    pose = np.eye(4, dtype=np.float32)
    # Composite rotation matrix in ZYX order.
    pose[0, 0] = cos_yaw * cos_pitch
    pose[0, 1] = cos_yaw * sin_pitch * sin_roll - sin_yaw * cos_roll
    pose[0, 2] = cos_yaw * sin_pitch * cos_roll + sin_yaw * sin_roll
    pose[1, 0] = sin_yaw * cos_pitch
    pose[1, 1] = sin_yaw * sin_pitch * sin_roll + cos_yaw * cos_roll
    pose[1, 2] = sin_yaw * sin_pitch * cos_roll - cos_yaw * sin_roll
    pose[2, 0] = -sin_pitch
    pose[2, 1] = cos_pitch * sin_roll
    pose[2, 2] = cos_pitch * cos_roll
    # Fine translation mimicking manipulation moves; -Z (depth) dominates.
    pose[0, 3] = forward_speed * 0.5
    pose[1, 3] = forward_speed * 0.3
    pose[2, 3] = -forward_speed

    per_frame_pose = torch.as_tensor(pose[:3, :])
    pose_embedding = torch.stack([per_frame_pose] * max_needed_frames, dim=0)
    pose_embedding = pose_embedding.reshape(max_needed_frames, -1)

    # Condition mask over the same number of frames.
    mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
    mask[start_frame:min(start_frame + initial_condition_frames, max_needed_frames)] = 1.0

    camera_embedding = torch.cat([pose_embedding, mask], dim=1)
    print(f"🔧 OpenX合成camera embedding shape: {camera_embedding.shape}")
    return camera_embedding.to(torch.bfloat16)
640
+
641
+
642
def generate_nuscenes_camera_embeddings_sliding(
    scene_info, start_frame, initial_condition_frames, new_frames):
    """Build camera embeddings for NuScenes (sliding-window variant).

    Output: ``[max_needed_frames, 8]`` bfloat16 — a 7D pose vector
    (tx, ty, tz, qw, qx, qy, qz) per frame plus one condition-mask value,
    matching the layout used by train_moe.py.
    """
    time_compression_ratio = 4

    # Camera frames the FramePack index layout consumes.
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames

    def _finish(pose_sequence, n_frames):
        # Append the condition-mask column; shared by every branch below.
        mask = torch.zeros(n_frames, 1, dtype=torch.float32)
        mask[start_frame:min(start_frame + initial_condition_frames, n_frames)] = 1.0
        return torch.cat([pose_sequence, mask], dim=1)  # [n_frames, 8]

    if scene_info is not None and 'keyframe_poses' in scene_info:
        print("🔧 使用NuScenes真实pose数据")
        keyframe_poses = scene_info['keyframe_poses']

        if not keyframe_poses:
            # No usable keyframes: emit an all-zero pose sequence.
            print("⚠️ NuScenes keyframe_poses为空,使用零pose")
            n_frames = max(framepack_needed_frames, 30)
            embedding = _finish(torch.zeros(n_frames, 7, dtype=torch.float32), n_frames)
            print(f"🔧 NuScenes零pose embedding shape: {embedding.shape}")
            return embedding.to(torch.bfloat16)

        # Translations are expressed relative to the first keyframe.
        ref_translation = np.array(keyframe_poses[0]['translation'])
        n_frames = max(framepack_needed_frames, 30)

        pose_vecs = []
        for idx in range(n_frames):
            if idx < len(keyframe_poses):
                current = keyframe_poses[idx]
                # Relative displacement; rotation kept as-is (simplified).
                translation = torch.tensor(
                    np.array(current['translation']) - ref_translation,
                    dtype=torch.float32,
                )
                rotation = torch.tensor(current['rotation'], dtype=torch.float32)
            else:
                # Past the available poses: identity pose.
                translation = torch.zeros(3, dtype=torch.float32)
                rotation = torch.tensor([1.0, 0.0, 0.0, 0.0], dtype=torch.float32)
            pose_vecs.append(torch.cat([translation, rotation], dim=0))  # [7]

        embedding = _finish(torch.stack(pose_vecs, dim=0), n_frames)
        print(f"🔧 NuScenes真实pose embedding shape: {embedding.shape}")
        return embedding.to(torch.bfloat16)

    print("🔧 使用NuScenes合成pose数据")
    n_frames = max(framepack_needed_frames, 30)

    # Synthetic left-turn trajectory, like a city-driving turn.
    radius = 15.0  # large turning radius, appropriate for a car
    pose_vecs = []
    for idx in range(n_frames):
        angle = idx * 0.04  # per-frame rotation increment (radians)

        # Position on the circular-arc trajectory (planar motion, y fixed).
        translation = torch.tensor(
            [radius * np.sin(angle), 0.0, radius * (1 - np.cos(angle))],
            dtype=torch.float32,
        )

        # Heading follows the arc tangent; quaternion for rotation about Y.
        yaw = angle + np.pi / 2
        rotation = torch.tensor(
            [np.cos(yaw / 2), 0.0, 0.0, np.sin(yaw / 2)],
            dtype=torch.float32,
        )

        pose_vecs.append(torch.cat([translation, rotation], dim=0))  # [7]

    embedding = _finish(torch.stack(pose_vecs, dim=0), n_frames)
    print(f"🔧 NuScenes合成左转pose embedding shape: {embedding.shape}")
    return embedding.to(torch.bfloat16)
749
+
750
def prepare_framepack_sliding_window_with_camera_moe(
    history_latents,
    target_frames_to_generate,
    camera_embedding_full,
    start_frame,
    modality_type,
    max_history_frames=49):
    """FramePack sliding-window preparation - MoE version.

    Arranges the history latents and the full camera-embedding sequence into
    the fixed FramePack index layout and rebuilds the condition mask for the
    current history length. ``start_frame`` and ``max_history_frames`` are
    accepted for interface compatibility but are not used here.
    """
    # history_latents: [C, T, H, W] latents accumulated so far.
    C, T, H, W = history_latents.shape

    # Fixed index layout (this determines the camera frames needed):
    # [start(1) | 4x(16) | 2x(2) | 1x(1) | target(N)]; 19 = 16 + 2 + 1.
    total_len = 1 + 16 + 2 + 1 + target_frames_to_generate
    idx_start, idx_4x, idx_2x, idx_1x, latent_indices = torch.arange(0, total_len).split(
        [1, 16, 2, 1, target_frames_to_generate], dim=0)
    clean_latent_indices = torch.cat([idx_start, idx_1x], dim=0)

    # Zero-pad the camera sequence when it is shorter than the layout.
    if camera_embedding_full.shape[0] < total_len:
        print(f"⚠️ camera_embedding长度不足,进行零补齐: 当前长度 {camera_embedding_full.shape[0]}, 需要长度 {total_len}")
        pad = torch.zeros(
            total_len - camera_embedding_full.shape[0],
            camera_embedding_full.shape[1],
            dtype=camera_embedding_full.dtype,
            device=camera_embedding_full.device)
        camera_embedding_full = torch.cat([camera_embedding_full, pad], dim=0)

    combined_camera = torch.zeros(
        total_len,
        camera_embedding_full.shape[1],
        dtype=camera_embedding_full.dtype,
        device=camera_embedding_full.device)

    # Right-align up to 19 frames of history camera poses in the context slots.
    hist_cam = camera_embedding_full[max(T - 19, 0):T, :].clone()
    combined_camera[19 - hist_cam.shape[0]:19, :] = hist_cam

    # Target-frame camera poses follow directly after the context slots.
    tgt_cam = camera_embedding_full[T:T + target_frames_to_generate, :].clone()
    combined_camera[19:19 + tgt_cam.shape[0], :] = tgt_cam

    # Rebuild the condition mask (last channel): first everything is target…
    combined_camera[:, -1] = 0.0
    available_frames = min(T, 19) if T > 0 else 0
    if available_frames:
        # …then flag slots backed by real clean latents as condition frames.
        combined_camera[19 - available_frames:19, -1] = 1.0

    print(f"🔧 MoE Camera mask更新:")
    print(f" - 历史帧数: {T}")
    print(f" - 有效condition帧数: {available_frames}")
    print(f" - 模态类型: {modality_type}")

    # Assemble the 19-slot clean-latent context window, right-aligned.
    clean_latents_combined = torch.zeros(
        C, 19, H, W, dtype=history_latents.dtype, device=history_latents.device)
    if available_frames:
        clean_latents_combined[:, 19 - available_frames:, :, :] = \
            history_latents[:, -available_frames:, :, :]

    clean_latents_4x = clean_latents_combined[:, 0:16, :, :]
    clean_latents_2x = clean_latents_combined[:, 16:18, :, :]
    clean_latents_1x = clean_latents_combined[:, 18:19, :, :]

    # Anchor on the very first history frame (zeros when there is no history).
    if T > 0:
        anchor_latent = history_latents[:, 0:1, :, :]
    else:
        anchor_latent = torch.zeros(C, 1, H, W, dtype=history_latents.dtype, device=history_latents.device)

    clean_latents = torch.cat([anchor_latent, clean_latents_1x], dim=1)

    return {
        'latent_indices': latent_indices,
        'clean_latents': clean_latents,
        'clean_latents_2x': clean_latents_2x,
        'clean_latents_4x': clean_latents_4x,
        'clean_latent_indices': clean_latent_indices,
        'clean_latent_2x_indices': idx_2x,
        'clean_latent_4x_indices': idx_4x,
        'camera_embedding': combined_camera,
        'modality_type': modality_type,  # modality tag passed through to the caller
        'current_length': T,
        'next_length': T + target_frames_to_generate,
    }
839
+
840
def overlay_controls(frame_img, pose_vec, icons):
    """
    Overlay WASD / arrow control icons on a frame based on the camera pose.

    pose_vec: 12 elements (a flattened 3x4 pose matrix) followed by a mask
    value. Returns the image (modified in place when icons are pasted).
    """
    if pose_vec is None or np.all(pose_vec[:12] == 0):
        return frame_img

    # Flattened 3x4 layout:
    # [r00, r01, r02, tx, r10, r11, r12, ty, r20, r21, r22, tz]
    tx = pose_vec[3]
    # ty = pose_vec[7]
    tz = pose_vec[11]

    # Yaw (about Y): sin(yaw) = r02, cos(yaw) = r00.
    yaw = np.arctan2(pose_vec[2], pose_vec[0])
    # Pitch (about X): sin(pitch) = -r12, cos(pitch) = r22.
    pitch = np.arctan2(-pose_vec[6], pose_vec[10])

    # Activation thresholds for key presses.
    TRANS_THRESH = 0.01
    ROT_THRESH = 0.005

    # Key states. Translation (WASD): -Z is forward, +X is right.
    # Rotation (arrows): yaw + = left, - = right; pitch + = down, - = up.
    key_state = {
        'forward': tz < -TRANS_THRESH,
        'backward': tz > TRANS_THRESH,
        'left': tx < -TRANS_THRESH,
        'right': tx > TRANS_THRESH,
        'turn_left': yaw > ROT_THRESH,
        'turn_right': yaw < -ROT_THRESH,
        'turn_up': pitch < -ROT_THRESH,
        'turn_down': pitch > ROT_THRESH,
    }

    img_w, img_h = frame_img.size
    spacing = 60

    def stamp(active_name, inactive_name, active, x, y):
        chosen = active_name if active else inactive_name
        icon = icons.get(chosen)
        if icon is not None:
            # Paste using the icon's own alpha channel.
            frame_img.paste(icon, (int(x), int(y)), icon)

    # WASD cluster (bottom-left).
    wasd_x = 100
    base_y = img_h - 100
    stamp('move_forward.png', 'not_move_forward.png', key_state['forward'], wasd_x, base_y - spacing)   # W
    stamp('move_left.png', 'not_move_left.png', key_state['left'], wasd_x - spacing, base_y)            # A
    stamp('move_backward.png', 'not_move_backward.png', key_state['backward'], wasd_x, base_y)          # S
    stamp('move_right.png', 'not_move_right.png', key_state['right'], wasd_x + spacing, base_y)         # D

    # Arrow cluster (bottom-right).
    arrow_x = img_w - 150
    stamp('turn_up.png', 'not_turn_up.png', key_state['turn_up'], arrow_x, base_y - spacing)            # up
    stamp('turn_left.png', 'not_turn_left.png', key_state['turn_left'], arrow_x - spacing, base_y)      # left
    stamp('turn_down.png', 'not_turn_down.png', key_state['turn_down'], arrow_x, base_y)                # down
    stamp('turn_right.png', 'not_turn_right.png', key_state['turn_right'], arrow_x + spacing, base_y)   # right

    return frame_img
922
+
923
+
924
def inference_moe_framepack_sliding_window(
    condition_pth_path,
    dit_path,
    output_path="../examples/output_videos/output_moe_framepack_sliding.mp4",
    start_frame=0,
    initial_condition_frames=8,
    frames_per_generation=4,
    total_frames_to_generate=32,
    max_history_frames=49,
    device="cuda",
    prompt="A video of a scene shot using a pedestrian's front camera while walking",
    modality_type="sekai",  # "sekai", "nuscenes" or "openx"
    use_real_poses=True,
    scene_info_path=None,  # scene metadata JSON, NuScenes modality only
    # CFG parameters
    use_camera_cfg=True,
    camera_guidance_scale=2.0,
    text_guidance_scale=1.0,
    # MoE parameters
    moe_num_experts=4,
    moe_top_k=2,
    moe_hidden_dim=None,
    direction="left",
    use_gt_prompt=True,
    add_icons=False
    ):
    """
    MoE FramePack sliding-window video generation with multi-modality support.

    Loads a Wan2.1 pipeline, augments the DiT with per-block camera encoders,
    FramePack components and MoE experts, loads the checkpoint at ``dit_path``,
    then autoregressively generates ``total_frames_to_generate`` latent frames
    (``frames_per_generation`` per step) conditioned on camera embeddings for
    the selected modality, optionally with camera and/or text CFG. The decoded
    video is written to ``output_path``; with ``add_icons`` control-key icons
    are overlaid per frame from the camera poses.
    """
    # Create the output directory.
    dir_path = os.path.dirname(output_path)
    os.makedirs(dir_path, exist_ok=True)

    print(f"🔧 MoE FramePack滑动窗口生成开始...")
    print(f"模态类型: {modality_type}")
    print(f"Camera CFG: {use_camera_cfg}, Camera guidance scale: {camera_guidance_scale}")
    print(f"Text guidance scale: {text_guidance_scale}")
    print(f"MoE配置: experts={moe_num_experts}, top_k={moe_top_k}")

    # 1. Model initialisation.
    replace_dit_model_in_manager()

    # NOTE(review): model paths are hard-coded to a specific mount point.
    model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
    model_manager.load_models([
        "/mnt/data/louis_crq/models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
        "/mnt/data/louis_crq/models/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
        "/mnt/data/louis_crq/models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
    ])
    pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager, device="cuda")

    # 2. Add the legacy per-block camera encoder (checkpoint compatibility);
    # initialised so it is initially an identity/no-op contribution.
    dim = pipe.dit.blocks[0].self_attn.q.weight.shape[0]
    for block in pipe.dit.blocks:
        block.cam_encoder = nn.Linear(13, dim)
        block.projector = nn.Linear(dim, dim)
        block.cam_encoder.weight.data.zero_()
        block.cam_encoder.bias.data.zero_()
        block.projector.weight = nn.Parameter(torch.eye(dim))
        block.projector.bias = nn.Parameter(torch.zeros(dim))

    # 3. Add FramePack components.
    add_framepack_components(pipe.dit)

    # 4. Add MoE components.
    moe_config = {
        "num_experts": moe_num_experts,
        "top_k": moe_top_k,
        "hidden_dim": moe_hidden_dim or dim * 2,
        "sekai_input_dim": 13,    # Sekai: 12-D pose + 1-D mask
        "nuscenes_input_dim": 8,  # NuScenes: 7-D pose + 1-D mask
        "openx_input_dim": 13     # OpenX: 12-D pose + 1-D mask (sekai-like)
    }
    add_moe_components(pipe.dit, moe_config)

    # 5. Load the trained weights.
    dit_state_dict = torch.load(dit_path, map_location="cpu")
    # strict=False so newly added MoE components without weights are tolerated.
    pipe.dit.load_state_dict(dit_state_dict, strict=False)
    pipe = pipe.to(device)
    model_dtype = next(pipe.dit.parameters()).dtype

    if hasattr(pipe.dit, 'clean_x_embedder'):
        pipe.dit.clean_x_embedder = pipe.dit.clean_x_embedder.to(dtype=model_dtype)

    # Set the number of denoising steps.
    pipe.scheduler.set_timesteps(50)

    # 6. Load the initial condition latents.
    print("Loading initial condition frames...")
    initial_latents, encoded_data = load_encoded_video_from_pth(
        condition_pth_path,
        start_frame=start_frame,
        num_frames=initial_condition_frames
    )

    # Spatial center-crop to the model's latent resolution.
    target_height, target_width = 60, 104
    C, T, H, W = initial_latents.shape

    if H > target_height or W > target_width:
        h_start = (H - target_height) // 2
        w_start = (W - target_width) // 2
        initial_latents = initial_latents[:, :, h_start:h_start+target_height, w_start:w_start+target_width]
        H, W = target_height, target_width

    history_latents = initial_latents.to(device, dtype=model_dtype)

    print(f"初始history_latents shape: {history_latents.shape}")

    # 7. Encode the prompt, with CFG support.
    if use_gt_prompt and 'prompt_emb' in encoded_data:
        print("✅ 使用预编码的GT prompt embedding")
        prompt_emb_pos = encoded_data['prompt_emb']
        # Move the prompt embedding to the target device / dtype.
        if 'context' in prompt_emb_pos:
            prompt_emb_pos['context'] = prompt_emb_pos['context'].to(device, dtype=model_dtype)
        if 'context_mask' in prompt_emb_pos:
            prompt_emb_pos['context_mask'] = prompt_emb_pos['context_mask'].to(device, dtype=model_dtype)

        # If text CFG is enabled, also encode an empty negative prompt.
        if text_guidance_scale > 1.0:
            prompt_emb_neg = pipe.encode_prompt("")
            print(f"使用Text CFG with GT prompt,guidance scale: {text_guidance_scale}")
        else:
            prompt_emb_neg = None
            print("不使用Text CFG")

        # Print the GT prompt text, when present.
        if 'prompt' in encoded_data['prompt_emb']:
            gt_prompt_text = encoded_data['prompt_emb']['prompt']
            print(f"📝 GT Prompt文本: {gt_prompt_text}")
    else:
        # Re-encode using the prompt argument passed in.
        print(f"🔄 重新编码prompt: {prompt}")
        if text_guidance_scale > 1.0:
            prompt_emb_pos = pipe.encode_prompt(prompt)
            prompt_emb_neg = pipe.encode_prompt("")
            print(f"使用Text CFG,guidance scale: {text_guidance_scale}")
        else:
            prompt_emb_pos = pipe.encode_prompt(prompt)
            prompt_emb_neg = None
            print("不使用Text CFG")

    # 8. Load scene metadata (NuScenes only).
    scene_info = None
    if modality_type == "nuscenes" and scene_info_path and os.path.exists(scene_info_path):
        with open(scene_info_path, 'r') as f:
            scene_info = json.load(f)
        print(f"加载NuScenes场景信息: {scene_info_path}")

    # 9. Pre-generate the full camera-embedding sequence for the modality.
    if modality_type == "sekai":
        camera_embedding_full = generate_sekai_camera_embeddings_sliding(
            encoded_data.get('cam_emb', None),
            start_frame,
            initial_condition_frames,
            total_frames_to_generate,
            0,
            use_real_poses=use_real_poses,
            direction=direction
        ).to(device, dtype=model_dtype)
    elif modality_type == "nuscenes":
        camera_embedding_full = generate_nuscenes_camera_embeddings_sliding(
            scene_info,
            start_frame,
            initial_condition_frames,
            total_frames_to_generate
        ).to(device, dtype=model_dtype)
    elif modality_type == "openx":
        camera_embedding_full = generate_openx_camera_embeddings_sliding(
            encoded_data,
            start_frame,
            initial_condition_frames,
            total_frames_to_generate,
            use_real_poses=use_real_poses
        ).to(device, dtype=model_dtype)
    else:
        raise ValueError(f"不支持的模态类型: {modality_type}")

    print(f"完整camera序列shape: {camera_embedding_full.shape}")

    # 10. Zeroed (unconditional) camera embedding for camera CFG.
    if use_camera_cfg:
        camera_embedding_uncond = torch.zeros_like(camera_embedding_full)
        print(f"创建无条件camera embedding用于CFG")

    # 11. Sliding-window generation loop.
    total_generated = 0
    all_generated_frames = []

    while total_generated < total_frames_to_generate:
        current_generation = min(frames_per_generation, total_frames_to_generate - total_generated)
        print(f"\n🔧 生成步骤 {total_generated // frames_per_generation + 1}")
        print(f"当前历史长度: {history_latents.shape[1]}, 本次生成: {current_generation}")

        # FramePack data preparation - MoE version.
        framepack_data = prepare_framepack_sliding_window_with_camera_moe(
            history_latents,
            current_generation,
            camera_embedding_full,
            start_frame,
            modality_type,
            max_history_frames
        )

        # Prepare the batched inputs.
        clean_latents = framepack_data['clean_latents'].unsqueeze(0)
        clean_latents_2x = framepack_data['clean_latents_2x'].unsqueeze(0)
        clean_latents_4x = framepack_data['clean_latents_4x'].unsqueeze(0)
        camera_embedding = framepack_data['camera_embedding'].unsqueeze(0)

        # Per-modality routing input for the MoE experts.
        modality_inputs = {modality_type: camera_embedding}

        # Unconditional camera embedding (truncated to match) for CFG.
        if use_camera_cfg:
            camera_embedding_uncond_batch = camera_embedding_uncond[:camera_embedding.shape[1], :].unsqueeze(0)
            modality_inputs_uncond = {modality_type: camera_embedding_uncond_batch}

        # Index tensors are kept on CPU.
        latent_indices = framepack_data['latent_indices'].unsqueeze(0).cpu()
        clean_latent_indices = framepack_data['clean_latent_indices'].unsqueeze(0).cpu()
        clean_latent_2x_indices = framepack_data['clean_latent_2x_indices'].unsqueeze(0).cpu()
        clean_latent_4x_indices = framepack_data['clean_latent_4x_indices'].unsqueeze(0).cpu()

        # Initialise the latents to be generated with Gaussian noise.
        new_latents = torch.randn(
            1, C, current_generation, H, W,
            device=device, dtype=model_dtype
        )

        extra_input = pipe.prepare_extra_input(new_latents)

        print(f"Camera embedding shape: {camera_embedding.shape}")
        print(f"Camera mask分布 - condition: {torch.sum(camera_embedding[0, :, -1] == 1.0).item()}, target: {torch.sum(camera_embedding[0, :, -1] == 0.0).item()}")

        # Denoising loop, with optional CFG.
        timesteps = pipe.scheduler.timesteps

        for i, timestep in enumerate(timesteps):
            if i % 10 == 0:
                print(f" 去噪步骤 {i+1}/{len(timesteps)}")

            timestep_tensor = timestep.unsqueeze(0).to(device, dtype=model_dtype)

            with torch.no_grad():
                # CFG inference.
                if use_camera_cfg and camera_guidance_scale > 1.0:
                    # Conditional prediction (with camera).
                    noise_pred_cond, moe_loess = pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding,
                        modality_inputs=modality_inputs,  # MoE modality input
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **prompt_emb_pos,
                        **extra_input
                    )

                    # Unconditional prediction (zeroed camera).
                    noise_pred_uncond, moe_loess = pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding_uncond_batch,
                        modality_inputs=modality_inputs_uncond,  # MoE unconditional modality input
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **(prompt_emb_neg if prompt_emb_neg else prompt_emb_pos),
                        **extra_input
                    )

                    # Camera CFG.
                    noise_pred = noise_pred_uncond + camera_guidance_scale * (noise_pred_cond - noise_pred_uncond)

                    # When text CFG is also enabled, apply it on top.
                    if text_guidance_scale > 1.0 and prompt_emb_neg:
                        noise_pred_text_uncond, moe_loess = pipe.dit(
                            new_latents,
                            timestep=timestep_tensor,
                            cam_emb=camera_embedding,
                            modality_inputs=modality_inputs,
                            latent_indices=latent_indices,
                            clean_latents=clean_latents,
                            clean_latent_indices=clean_latent_indices,
                            clean_latents_2x=clean_latents_2x,
                            clean_latent_2x_indices=clean_latent_2x_indices,
                            clean_latents_4x=clean_latents_4x,
                            clean_latent_4x_indices=clean_latent_4x_indices,
                            **prompt_emb_neg,
                            **extra_input
                        )

                        # Apply text CFG to the camera-CFG-combined prediction.
                        noise_pred = noise_pred_text_uncond + text_guidance_scale * (noise_pred - noise_pred_text_uncond)

                elif text_guidance_scale > 1.0 and prompt_emb_neg:
                    # Text CFG only.
                    noise_pred_cond, moe_loess = pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding,
                        modality_inputs=modality_inputs,
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **prompt_emb_pos,
                        **extra_input
                    )

                    noise_pred_uncond, moe_loess= pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding,
                        modality_inputs=modality_inputs,
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **prompt_emb_neg,
                        **extra_input
                    )

                    noise_pred = noise_pred_uncond + text_guidance_scale * (noise_pred_cond - noise_pred_uncond)

                else:
                    # Standard inference (no CFG).
                    noise_pred, moe_loess = pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding,
                        modality_inputs=modality_inputs,  # MoE modality input
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **prompt_emb_pos,
                        **extra_input
                    )

            new_latents = pipe.scheduler.step(noise_pred, timestep, new_latents)

        # Append the new frames to the history.
        new_latents_squeezed = new_latents.squeeze(0)
        history_latents = torch.cat([history_latents, new_latents_squeezed], dim=1)

        # Maintain the sliding window: keep the first frame + most recent frames.
        if history_latents.shape[1] > max_history_frames:
            first_frame = history_latents[:, 0:1, :, :]
            recent_frames = history_latents[:, -(max_history_frames-1):, :, :]
            history_latents = torch.cat([first_frame, recent_frames], dim=1)
            print(f"历史窗口已满,保留第一帧+最新{max_history_frames-1}帧")

        print(f"更新后history_latents shape: {history_latents.shape}")

        all_generated_frames.append(new_latents_squeezed)
        total_generated += current_generation

        print(f"✅ 已生成 {total_generated}/{total_frames_to_generate} 帧")

    # 12. Decode and save.
    print("\n🔧 解码生成的视频...")

    all_generated = torch.cat(all_generated_frames, dim=1)
    final_video = torch.cat([initial_latents.to(all_generated.device), all_generated], dim=1).unsqueeze(0)

    print(f"最终视频shape: {final_video.shape}")

    decoded_video = pipe.decode_video(final_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16))

    print(f"Saving video to {output_path} ...")

    # [-1, 1] -> [0, 255] uint8, frames-first layout for the writer.
    video_np = decoded_video[0].to(torch.float32).permute(1, 2, 3, 0).cpu().numpy()
    video_np = (video_np * 0.5 + 0.5).clip(0, 1)
    video_np = (video_np * 255).astype(np.uint8)

    icons = {}
    video_camera_poses = None
    if add_icons:
        # Load the icon assets used for the overlay.
        icons_dir = os.path.join(ROOT_DIR, 'icons')
        icon_names = ['move_forward.png', 'not_move_forward.png',
                      'move_backward.png', 'not_move_backward.png',
                      'move_left.png', 'not_move_left.png',
                      'move_right.png', 'not_move_right.png',
                      'turn_up.png', 'not_turn_up.png',
                      'turn_down.png', 'not_turn_down.png',
                      'turn_left.png', 'not_turn_left.png',
                      'turn_right.png', 'not_turn_right.png']
        for name in icon_names:
            path = os.path.join(icons_dir, name)
            if os.path.exists(path):
                try:
                    icon = Image.open(path).convert("RGBA")
                    # Resize the icon to overlay size.
                    icon = icon.resize((50, 50), Image.Resampling.LANCZOS)
                    icons[name] = icon
                except Exception as e:
                    print(f"Error loading icon {name}: {e}")
            else:
                print(f"Warning: Icon {name} not found at {path}")

        # Expand latent-frame poses to video frames (4x temporal compression).
        time_compression_ratio = 4
        camera_poses = camera_embedding_full.detach().float().cpu().numpy()
        video_camera_poses = [x for x in camera_poses for _ in range(time_compression_ratio)]

    with imageio.get_writer(output_path, fps=20) as writer:
        for i, frame in enumerate(video_np):
            # Convert to PIL for overlay
            img = Image.fromarray(frame)

            if add_icons and video_camera_poses is not None and icons:
                # Video frame i corresponds to camera_embedding_full[start_frame + i]
                pose_idx = start_frame + i
                if pose_idx < len(video_camera_poses):
                    pose_vec = video_camera_poses[pose_idx]
                    img = overlay_controls(img, pose_vec, icons)

            writer.append_data(np.array(img))

    print(f"🔧 MoE FramePack滑动窗口生成完成! 保存到: {output_path}")
    print(f"总共生成了 {total_generated} 帧 (压缩后), 对应原始 {total_generated * 4} 帧")
    print(f"使用模态: {modality_type}")
1367
+
1368
+
1369
def _parse_cli_bool(value):
    """Parse a boolean CLI value ('true'/'false', '1'/'0', 'yes'/'no', ...).

    With plain ``default=False`` and no ``type``, argparse stores the raw
    string, so ANY supplied value — including the literal string "False" —
    is truthy. Routing values through this parser keeps ``--flag False``
    working as users expect while preserving ``--flag True`` usage.
    """
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() in ("1", "true", "t", "yes", "y")


def main():
    """CLI entry point: parse arguments and run MoE FramePack sliding-window generation."""
    parser = argparse.ArgumentParser(description="MoE FramePack滑动窗口视频生成 - 支持多模态")

    # Basic parameters
    parser.add_argument("--condition_pth", type=str,
                        default="../examples/condition_pth/garden_1.pth")
    parser.add_argument("--start_frame", type=int, default=0)
    parser.add_argument("--initial_condition_frames", type=int, default=1)
    parser.add_argument("--frames_per_generation", type=int, default=8)
    parser.add_argument("--total_frames_to_generate", type=int, default=24)
    parser.add_argument("--max_history_frames", type=int, default=100)
    # BUGFIX: was `default=False` with no type, so any value (even "False")
    # parsed as truthy; now parsed as a real boolean.
    parser.add_argument("--use_real_poses", type=_parse_cli_bool, default=False)
    parser.add_argument("--dit_path", type=str, default=None, required=True,
                        help="path to the pretrained DiT MoE model checkpoint")
    parser.add_argument("--output_path", type=str,
                        default='./examples/output_videos/output_moe_framepack_sliding.mp4')
    parser.add_argument("--prompt", type=str, default=None,
                        help="text prompt for video generation")
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--add_icons", action="store_true", default=False,
                        help="在生成的视频上叠加控制图标")

    # Modality parameters
    parser.add_argument("--modality_type", type=str, choices=["sekai", "nuscenes", "openx"],
                        default="sekai", help="模态类型:sekai 或 nuscenes 或 openx")
    parser.add_argument("--scene_info_path", type=str, default=None,
                        help="NuScenes场景信息文件路径(仅用于nuscenes模态)")

    # CFG parameters
    # BUGFIX: same truthy-string pitfall as --use_real_poses; now a real boolean.
    parser.add_argument("--use_camera_cfg", type=_parse_cli_bool, default=False,
                        help="使用Camera CFG")
    parser.add_argument("--camera_guidance_scale", type=float, default=2.0,
                        help="Camera guidance scale for CFG")
    parser.add_argument("--text_guidance_scale", type=float, default=1.0,
                        help="Text guidance scale for CFG")

    # MoE parameters
    parser.add_argument("--moe_num_experts", type=int, default=3, help="专家数量")
    parser.add_argument("--moe_top_k", type=int, default=1, help="Top-K专家")
    parser.add_argument("--moe_hidden_dim", type=int, default=None, help="MoE隐藏层维度")
    parser.add_argument("--direction", type=str, default="left", help="生成视频的行进轨迹方向")
    parser.add_argument("--use_gt_prompt", action="store_true", default=False,
                        help="使用数据集中的ground truth prompt embedding")

    args = parser.parse_args()

    print(f"🔧 MoE FramePack CFG生成设置:")
    print(f"模态类型: {args.modality_type}")
    print(f"Camera CFG: {args.use_camera_cfg}")
    if args.use_camera_cfg:
        print(f"Camera guidance scale: {args.camera_guidance_scale}")
    print(f"使用GT Prompt: {args.use_gt_prompt}")
    print(f"Text guidance scale: {args.text_guidance_scale}")
    print(f"MoE配置: experts={args.moe_num_experts}, top_k={args.moe_top_k}")
    # BUGFIX: log message was "DiT{path}" with no separator.
    print(f"DiT: {args.dit_path}")

    # Validate NuScenes parameters.
    if args.modality_type == "nuscenes" and not args.scene_info_path:
        print("⚠️ 使用NuScenes模态但未提供scene_info_path,将使用合成pose数据")

    inference_moe_framepack_sliding_window(
        condition_pth_path=args.condition_pth,
        dit_path=args.dit_path,
        output_path=args.output_path,
        start_frame=args.start_frame,
        initial_condition_frames=args.initial_condition_frames,
        frames_per_generation=args.frames_per_generation,
        total_frames_to_generate=args.total_frames_to_generate,
        max_history_frames=args.max_history_frames,
        device=args.device,
        prompt=args.prompt,
        modality_type=args.modality_type,
        use_real_poses=args.use_real_poses,
        scene_info_path=args.scene_info_path,
        # CFG parameters
        use_camera_cfg=args.use_camera_cfg,
        camera_guidance_scale=args.camera_guidance_scale,
        text_guidance_scale=args.text_guidance_scale,
        # MoE parameters
        moe_num_experts=args.moe_num_experts,
        moe_top_k=args.moe_top_k,
        moe_hidden_dim=args.moe_hidden_dim,
        direction=args.direction,
        use_gt_prompt=args.use_gt_prompt,
        add_icons=args.add_icons
    )


if __name__ == "__main__":
    main()
scripts/infer_moe.py ADDED
@@ -0,0 +1,1023 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import numpy as np
5
+ from PIL import Image
6
+ import imageio
7
+ import json
8
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
9
+ import argparse
10
+ from torchvision.transforms import v2
11
+ from einops import rearrange
12
+ import copy
13
+ from scipy.spatial.transform import Rotation as R
14
+
15
+
16
def compute_relative_pose_matrix(pose1, pose2):
    """Relative pose between two consecutive frames as a 3x4 [R_rel | t_rel] matrix.

    Args:
        pose1: camera pose of frame i, shape (7,): [tx, ty, tz, qx, qy, qz, qw]
        pose2: camera pose of frame i+1, same layout.

    Returns:
        np.ndarray of shape (3, 4): relative rotation in the first 3 columns,
        relative translation in the last column.
    """
    # Split each pose into translation and quaternion parts.
    trans_prev, quat_prev = pose1[:3], pose1[3:]
    trans_next, quat_next = pose2[:3], pose2[3:]

    # 1. Relative rotation: R_rel = R_next * R_prev^{-1}
    rot_prev = R.from_quat(quat_prev)
    rot_next = R.from_quat(quat_next)
    rel_rot_matrix = (rot_next * rot_prev.inv()).as_matrix()

    # 2. Relative translation expressed in the previous frame:
    #    t_rel = R_prev^T @ (t_next - t_prev)
    rel_translation = rot_prev.as_matrix().T @ (trans_next - trans_prev)

    # 3. Assemble the 3x4 matrix [R_rel | t_rel].
    return np.hstack([rel_rot_matrix, rel_translation.reshape(3, 1)])
47
+
48
+
49
def calculate_relative_rotation(current_rotation, reference_rotation):
    """Relative rotation quaternion - NuScenes-specific helper.

    Computes q_ref^{-1} * q_current via an explicit Hamilton product.
    Both inputs are [w, x, y, z] quaternions; unit norm is assumed, so the
    conjugate of the reference quaternion acts as its inverse.
    """
    q_cur = torch.tensor(current_rotation, dtype=torch.float32)
    q_ref = torch.tensor(reference_rotation, dtype=torch.float32)
    # Conjugate of the reference quaternion (its inverse for unit quaternions).
    w1, x1, y1, z1 = q_ref[0], -q_ref[1], -q_ref[2], -q_ref[3]
    w2, x2, y2, z2 = q_cur
    return torch.tensor([
        w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2,
        w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2,
        w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2,
        w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2,
    ])
63
+
64
+
65
def load_encoded_video_from_pth(pth_path, start_frame=0, num_frames=10):
    """Load pre-encoded video latents from a .pth file and slice a frame window.

    Returns (condition_latents, encoded_data) where condition_latents is the
    [C, num_frames, H, W] slice starting at start_frame, and encoded_data is
    the full dict stored in the file.

    Raises:
        ValueError: if the requested window extends past the stored sequence.
    """
    print(f"Loading encoded video from {pth_path}")

    encoded_data = torch.load(pth_path, weights_only=False, map_location="cpu")
    full_latents = encoded_data['latents']  # [C, T, H, W]

    print(f"Full latents shape: {full_latents.shape}")
    print(f"Extracting frames {start_frame} to {start_frame + num_frames}")

    end_frame = start_frame + num_frames
    if end_frame > full_latents.shape[1]:
        raise ValueError(f"Not enough frames: requested {start_frame + num_frames}, available {full_latents.shape[1]}")

    condition_latents = full_latents[:, start_frame:end_frame, :, :]
    print(f"Extracted condition latents shape: {condition_latents.shape}")

    return condition_latents, encoded_data
82
+
83
+
84
def compute_relative_pose(pose_a, pose_b, use_torch=False):
    """Pose of camera B relative to camera A: pose_b @ pose_a^{-1}.

    Both inputs are 4x4 extrinsic matrices. With use_torch=True the result is
    a float torch tensor; otherwise a numpy array.
    """
    assert pose_a.shape == (4, 4), f"相机A外参矩阵形状应为(4,4),实际为{pose_a.shape}"
    assert pose_b.shape == (4, 4), f"相机B外参矩阵形状应为(4,4),实际为{pose_b.shape}"

    if use_torch:
        # Torch path: promote numpy inputs to float tensors first.
        if not isinstance(pose_a, torch.Tensor):
            pose_a = torch.from_numpy(pose_a).float()
        if not isinstance(pose_b, torch.Tensor):
            pose_b = torch.from_numpy(pose_b).float()
        return torch.matmul(pose_b, torch.inverse(pose_a))

    # Numpy path: coerce plain sequences to float32 arrays.
    if not isinstance(pose_a, np.ndarray):
        pose_a = np.array(pose_a, dtype=np.float32)
    if not isinstance(pose_b, np.ndarray):
        pose_b = np.array(pose_b, dtype=np.float32)
    return np.matmul(pose_b, np.linalg.inv(pose_a))
107
+
108
+
109
def replace_dit_model_in_manager():
    """Patch diffsynth's loader registry so 'wan_video_dit' maps to the MoE class.

    Mutates the module-level ``model_loader_configs`` list in place, rebuilding
    the (names, classes) pair of every entry that registers 'wan_video_dit'.
    """
    from diffsynth.models.wan_video_dit_moe import WanModelMoe
    from diffsynth.configs.model_config import model_loader_configs

    for idx, entry in enumerate(model_loader_configs):
        keys_hash, keys_hash_with_shape, model_names, model_classes, model_resource = entry

        if 'wan_video_dit' not in model_names:
            continue

        patched_names, patched_classes = [], []
        for name, cls in zip(model_names, model_classes):
            patched_names.append(name)
            if name == 'wan_video_dit':
                # Swap in the MoE variant while keeping the registered name.
                patched_classes.append(WanModelMoe)
                print(f"✅ 替换了模型类: {name} -> WanModelMoe")
            else:
                patched_classes.append(cls)

        model_loader_configs[idx] = (keys_hash, keys_hash_with_shape, patched_names, patched_classes, model_resource)
131
+
132
+
133
def add_framepack_components(dit_model):
    """Attach FramePack's clean-latent embedder to a DiT model (idempotent).

    Creates ``dit_model.clean_x_embedder`` with three Conv3d projections for
    1x / 2x / 4x temporal-spatial compression, cast to the model's dtype.
    Does nothing if the attribute already exists.
    """
    if hasattr(dit_model, 'clean_x_embedder'):
        return

    # Inner dim inferred from the first block's self-attention query projection.
    inner_dim = dit_model.blocks[0].self_attn.q.weight.shape[0]

    class CleanXEmbedder(nn.Module):
        """Projects 16-channel clean latents at three compression scales."""

        def __init__(self, inner_dim):
            super().__init__()
            self.proj = nn.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
            self.proj_2x = nn.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
            self.proj_4x = nn.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))

        def forward(self, x, scale="1x"):
            # Select the projection for the requested scale, casting the input
            # to that layer's weight dtype first.
            if scale == "1x":
                return self.proj(x.to(self.proj.weight.dtype))
            if scale == "2x":
                return self.proj_2x(x.to(self.proj_2x.weight.dtype))
            if scale == "4x":
                return self.proj_4x(x.to(self.proj_4x.weight.dtype))
            raise ValueError(f"Unsupported scale: {scale}")

    embedder = CleanXEmbedder(inner_dim)
    # Match the host model's parameter dtype.
    model_dtype = next(dit_model.parameters()).dtype
    dit_model.clean_x_embedder = embedder.to(dtype=model_dtype)
    print("✅ 添加了FramePack的clean_x_embedder组件")
162
+
163
+
164
def add_moe_components(dit_model, moe_config):
    """Attach MoE components (modality processors, router, per-block experts).

    The config dict is recorded once; per-modality ``ModalityProcessor``
    adapters, a global router, and one ``MultiModalMoE`` per transformer block
    are (re)assigned on every call.
    """
    # Record the config only once; repeated calls keep the first config.
    if not hasattr(dit_model, 'moe_config'):
        dit_model.moe_config = moe_config
        print("✅ 添加了MoE配置到模型")
        dit_model.top_k = moe_config.get("top_k", 1)

    from diffsynth.models.wan_video_dit_moe import ModalityProcessor, MultiModalMoE

    # Block dim inferred from the first block's self-attention query projection.
    dim = dit_model.blocks[0].self_attn.q.weight.shape[0]
    unified_dim = moe_config.get("unified_dim", 25)
    num_experts = moe_config.get("num_experts", 4)

    # Per-modality input adapters plus a shared global router.
    dit_model.sekai_processor = ModalityProcessor("sekai", 13, unified_dim)
    dit_model.nuscenes_processor = ModalityProcessor("nuscenes", 8, unified_dim)
    # OpenX uses a 13-dim input like sekai but is processed independently.
    dit_model.openx_processor = ModalityProcessor("openx", 13, unified_dim)
    dit_model.global_router = nn.Linear(unified_dim, num_experts)

    for i, block in enumerate(dit_model.blocks):
        # MoE network: consumes unified_dim features, emits block-dim features.
        block.moe = MultiModalMoE(
            unified_dim=unified_dim,
            output_dim=dim,
            num_experts=moe_config.get("num_experts", 4),
            top_k=moe_config.get("top_k", 2)
        )

        print(f"✅ Block {i} 添加了MoE组件 (unified_dim: {unified_dim}, experts: {moe_config.get('num_experts', 4)})")
192
+
193
+
194
def generate_sekai_camera_embeddings_sliding(cam_data, start_frame, current_history_length, new_frames, total_generated, use_real_poses=True):
    """Build per-frame camera embeddings for the Sekai dataset (sliding window).

    Returns a [seq_len, 13] bfloat16 tensor: 12 flattened relative-pose values
    per frame plus a trailing condition-mask column (1 = condition, 0 = target).
    """
    time_compression_ratio = 4

    # Camera frames FramePack consumes: start(1) + 4x(16) + 2x(2) + 1x(1) + new.
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames

    # Sequence must cover the raw request, FramePack's layout, and a floor of 30.
    seq_len = max(
        start_frame + current_history_length + new_frames,
        framepack_needed_frames,
        30
    )

    if use_real_poses and cam_data is not None and 'extrinsic' in cam_data:
        print("🔧 使用真实Sekai camera数据")
        cam_extrinsic = cam_data['extrinsic']

        print(f"🔧 计算Sekai camera序列长度:")
        print(f" - 基础需求: {start_frame + current_history_length + new_frames}")
        print(f" - FramePack需求: {framepack_needed_frames}")
        print(f" - 最终生成: {seq_len}")

        rel_poses = []
        for idx in range(seq_len):
            # Map embedding index back to the raw (uncompressed) frame index.
            src = idx * time_compression_ratio
            nxt = src + time_compression_ratio
            if nxt < len(cam_extrinsic):
                rel = compute_relative_pose(cam_extrinsic[src], cam_extrinsic[nxt])
                rel_poses.append(torch.as_tensor(rel[:3, :]))
            else:
                # Past the recorded trajectory: fall back to zero motion.
                print(f"⚠️ 帧{src}超出camera数据范围,使用零运动")
                rel_poses.append(torch.zeros(3, 4))

        # Flatten each 3x4 pose to 12 values per frame.
        pose_embedding = torch.stack(rel_poses, dim=0).reshape(seq_len, -1)
        mask = torch.zeros(seq_len, 1, dtype=torch.float32)
        mask[start_frame:min(start_frame + current_history_length, seq_len)] = 1.0
        camera_embedding = torch.cat([pose_embedding, mask], dim=1)
        print(f"🔧 Sekai真实camera embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)

    print("🔧 使用Sekai合成camera数据")
    print(f"🔧 生成Sekai合成camera帧数: {seq_len}")

    # Synthetic motion: a constant left turn with slow forward drift.
    # The per-frame step is identical for every frame, so precompute it once.
    yaw_per_frame = -0.1
    forward_speed = 0.005
    cos_yaw, sin_yaw = np.cos(yaw_per_frame), np.sin(yaw_per_frame)

    step = np.eye(4, dtype=np.float32)
    # Rotation about the Y axis.
    step[0, 0] = cos_yaw
    step[0, 2] = sin_yaw
    step[2, 0] = -sin_yaw
    step[2, 2] = cos_yaw
    # Advance along the local -Z axis (forward).
    step[2, 3] = -forward_speed
    # Slight centripetal drift to mimic a circular path.
    step[0, 3] = 0.002

    rel_poses = [torch.as_tensor(step[:3, :]) for _ in range(seq_len)]

    # Flatten each 3x4 pose to 12 values per frame.
    pose_embedding = torch.stack(rel_poses, dim=0).reshape(seq_len, -1)
    mask = torch.zeros(seq_len, 1, dtype=torch.float32)
    mask[start_frame:min(start_frame + current_history_length, seq_len)] = 1.0
    camera_embedding = torch.cat([pose_embedding, mask], dim=1)
    print(f"🔧 Sekai合成camera embedding shape: {camera_embedding.shape}")
    return camera_embedding.to(torch.bfloat16)
294
+
295
def generate_openx_camera_embeddings_sliding(encoded_data, start_frame, current_history_length, new_frames, use_real_poses):
    """Build per-frame camera embeddings for the OpenX dataset (sliding window).

    Returns a [seq_len, 13] bfloat16 tensor: 12 flattened relative-pose values
    per frame plus a trailing condition-mask column (1 = condition, 0 = target).
    """
    time_compression_ratio = 4

    # Camera frames FramePack consumes: start(1) + 4x(16) + 2x(2) + 1x(1) + new.
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames

    # Sequence must cover the raw request, FramePack's layout, and a floor of 30.
    seq_len = max(
        start_frame + current_history_length + new_frames,
        framepack_needed_frames,
        30
    )

    has_real = (use_real_poses and encoded_data is not None
                and 'cam_emb' in encoded_data and 'extrinsic' in encoded_data['cam_emb'])

    if has_real:
        print("🔧 使用OpenX真实camera数据")
        cam_extrinsic = encoded_data['cam_emb']['extrinsic']

        print(f"🔧 计算OpenX camera序列长度:")
        print(f" - 基础需求: {start_frame + current_history_length + new_frames}")
        print(f" - FramePack需求: {framepack_needed_frames}")
        print(f" - 最终生成: {seq_len}")

        rel_poses = []
        for idx in range(seq_len):
            # OpenX samples at 4-frame strides, like sekai but on shorter clips.
            src = idx * time_compression_ratio
            nxt = src + time_compression_ratio
            if nxt < len(cam_extrinsic):
                rel = compute_relative_pose(cam_extrinsic[src], cam_extrinsic[nxt])
                rel_poses.append(torch.as_tensor(rel[:3, :]))
            else:
                # Past the recorded trajectory: fall back to zero motion.
                print(f"⚠️ 帧{src}超出OpenX camera数据范围,使用零运动")
                rel_poses.append(torch.zeros(3, 4))

        # Flatten each 3x4 pose to 12 values per frame.
        pose_embedding = torch.stack(rel_poses, dim=0).reshape(seq_len, -1)
        mask = torch.zeros(seq_len, 1, dtype=torch.float32)
        mask[start_frame:min(start_frame + current_history_length, seq_len)] = 1.0
        camera_embedding = torch.cat([pose_embedding, mask], dim=1)
        print(f"🔧 OpenX真实camera embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)

    print("🔧 使用OpenX合成camera数据")
    print(f"🔧 生成OpenX合成camera帧数: {seq_len}")

    # Synthetic robot-manipulation motion: small per-frame roll/pitch/yaw plus
    # a slow forward translation. Identical every frame, so precompute once.
    roll, pitch, yaw = 0.02, 0.01, 0.015
    forward_speed = 0.003
    cr, sr = np.cos(roll), np.sin(roll)
    cp, sp = np.cos(pitch), np.sin(pitch)
    cy, sy = np.cos(yaw), np.sin(yaw)

    step = np.eye(4, dtype=np.float32)
    # Simplified composite rotation (ZYX order) mimicking arm motion.
    step[0, 0] = cy * cp
    step[0, 1] = cy * sp * sr - sy * cr
    step[0, 2] = cy * sp * cr + sy * sr
    step[1, 0] = sy * cp
    step[1, 1] = sy * sp * sr + cy * cr
    step[1, 2] = sy * sp * cr - cy * sr
    step[2, 0] = -sp
    step[2, 1] = cp * sr
    step[2, 2] = cp * cr
    # Fine manipulation translation: mostly along depth (Z).
    step[0, 3] = forward_speed * 0.5
    step[1, 3] = forward_speed * 0.3
    step[2, 3] = -forward_speed

    rel_poses = [torch.as_tensor(step[:3, :]) for _ in range(seq_len)]

    # Flatten each 3x4 pose to 12 values per frame.
    pose_embedding = torch.stack(rel_poses, dim=0).reshape(seq_len, -1)
    mask = torch.zeros(seq_len, 1, dtype=torch.float32)
    mask[start_frame:min(start_frame + current_history_length, seq_len)] = 1.0
    camera_embedding = torch.cat([pose_embedding, mask], dim=1)
    print(f"🔧 OpenX合成camera embedding shape: {camera_embedding.shape}")
    return camera_embedding.to(torch.bfloat16)
409
+
410
+
411
def generate_nuscenes_camera_embeddings_sliding(scene_info, start_frame, current_history_length, new_frames):
    """Build camera embeddings for NuScenes (sliding window), matching train_moe.py.

    Returns a [seq_len, 8] bfloat16 tensor per frame: 3D relative translation +
    4D relative rotation quaternion + trailing condition-mask column.
    """
    time_compression_ratio = 4
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames
    seq_len = max(framepack_needed_frames, 30)

    if scene_info is not None and 'keyframe_poses' in scene_info:
        print("🔧 使用NuScenes真实pose数据")
        keyframe_poses = scene_info['keyframe_poses']

        # Raw keyframe indices (one extra sample so consecutive pairs can be
        # differenced below), clamped to the available pose list.
        kf_idx = [
            min((start_frame + i) * time_compression_ratio, len(keyframe_poses) - 1)
            for i in range(seq_len + 1)
        ]

        pose_vecs = []
        for i in range(seq_len):
            pose_prev = keyframe_poses[kf_idx[i]]
            pose_next = keyframe_poses[kf_idx[i + 1]]
            # Relative translation between consecutive (compressed) keyframes.
            translation = torch.tensor(
                np.array(pose_next['translation']) - np.array(pose_prev['translation']),
                dtype=torch.float32
            )
            # Relative rotation quaternion.
            relative_rotation = calculate_relative_rotation(
                pose_next['rotation'],
                pose_prev['rotation']
            )
            pose_vecs.append(torch.cat([translation, relative_rotation], dim=0))  # [7]

        pose_sequence = torch.stack(pose_vecs, dim=0)  # [seq_len, 7]
        mask = torch.zeros(seq_len, 1, dtype=torch.float32)
        mask[start_frame:min(start_frame + current_history_length, seq_len)] = 1.0
        camera_embedding = torch.cat([pose_sequence, mask], dim=1)
        print(f"🔧 NuScenes真实pose embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)

    print("🔧 使用NuScenes合成pose数据")
    # Synthesize an absolute circular trajectory first (+1 sample so that
    # consecutive poses can be turned into relative motion below).
    abs_translations, abs_rotations = [], []
    for i in range(seq_len + 1):
        angle = -i * 0.12
        radius = 8.0
        abs_translations.append(np.array(
            [radius * np.sin(angle), 0.0, radius * (1 - np.cos(angle))],
            dtype=np.float32
        ))
        yaw = angle + np.pi / 2
        abs_rotations.append(np.array(
            [np.cos(yaw / 2), 0.0, 0.0, np.sin(yaw / 2)],
            dtype=np.float32
        ))

    # Per-frame motion relative to the previous frame.
    pose_vecs = []
    for i in range(seq_len):
        translation = torch.tensor(abs_translations[i + 1] - abs_translations[i], dtype=torch.float32)
        # Hamilton product q_prev^{-1} * q_next for the relative rotation
        # (conjugate serves as the inverse for unit quaternions).
        q_prev, q_next = abs_rotations[i], abs_rotations[i + 1]
        w1, x1, y1, z1 = q_prev[0], -q_prev[1], -q_prev[2], -q_prev[3]
        w2, x2, y2, z2 = q_next
        relative_rotation = torch.tensor([
            w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2,
            w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2,
            w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2,
            w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2
        ], dtype=torch.float32)
        pose_vecs.append(torch.cat([translation, relative_rotation], dim=0))  # [7]

    pose_sequence = torch.stack(pose_vecs, dim=0)
    mask = torch.zeros(seq_len, 1, dtype=torch.float32)
    mask[start_frame:min(start_frame + current_history_length, seq_len)] = 1.0
    camera_embedding = torch.cat([pose_sequence, mask], dim=1)
    print(f"🔧 NuScenes合成相对pose embedding shape: {camera_embedding.shape}")
    return camera_embedding.to(torch.bfloat16)
496
+
497
def prepare_framepack_sliding_window_with_camera_moe(history_latents, target_frames_to_generate, camera_embedding_full, start_frame, modality_type, max_history_frames=49):
    """FramePack sliding-window preparation - MoE version.

    Splits the history latents into FramePack's start/4x/2x/1x conditioning
    groups, slices and re-masks the camera sequence, and returns the dict of
    tensors/indices the DiT forward pass expects.
    """
    # history_latents: [C, T, H, W] — the accumulated history so far.
    C, T, H, W = history_latents.shape

    # Fixed index layout: start(1) + 4x(16) + 2x(2) + 1x(1) + targets.
    total_len = 1 + 16 + 2 + 1 + target_frames_to_generate
    idx_start, idx_4x, idx_2x, idx_1x, latent_indices = torch.arange(0, total_len).split(
        [1, 16, 2, 1, target_frames_to_generate], dim=0)
    clean_latent_indices = torch.cat([idx_start, idx_1x], dim=0)

    # Zero-pad the camera sequence if it is shorter than the index layout.
    if camera_embedding_full.shape[0] < total_len:
        pad = torch.zeros(total_len - camera_embedding_full.shape[0], camera_embedding_full.shape[1],
                          dtype=camera_embedding_full.dtype, device=camera_embedding_full.device)
        camera_embedding_full = torch.cat([camera_embedding_full, pad], dim=0)

    # Take the window matching the index layout.
    combined_camera = camera_embedding_full[:total_len, :].clone()

    # Re-derive the condition mask from the actual history length.
    combined_camera[:, -1] = 0.0  # everything defaults to target (0)

    # Right-align the history inside the fixed 19-slot conditioning window.
    if T > 0:
        usable = min(T, 19)
        first = 19 - usable
        combined_camera[first:19, -1] = 1.0  # camera rows backing valid clean latents

    print(f"🔧 MoE Camera mask更新:")
    print(f" - 历史帧数: {T}")
    print(f" - 有效condition帧数: {usable if T > 0 else 0}")
    print(f" - 模态类型: {modality_type}")

    # Build the 19-frame clean-latent buffer, right-aligned with history.
    clean_buf = torch.zeros(C, 19, H, W, dtype=history_latents.dtype, device=history_latents.device)
    if T > 0:
        usable = min(T, 19)
        clean_buf[:, 19 - usable:, :, :] = history_latents[:, -usable:, :, :]

    clean_latents_4x = clean_buf[:, 0:16, :, :]
    clean_latents_2x = clean_buf[:, 16:18, :, :]
    clean_latents_1x = clean_buf[:, 18:19, :, :]

    # Start anchor: the very first history frame (zeros when no history yet).
    if T > 0:
        start_latent = history_latents[:, 0:1, :, :]
    else:
        start_latent = torch.zeros(C, 1, H, W, dtype=history_latents.dtype, device=history_latents.device)

    clean_latents = torch.cat([start_latent, clean_latents_1x], dim=1)

    return {
        'latent_indices': latent_indices,
        'clean_latents': clean_latents,
        'clean_latents_2x': clean_latents_2x,
        'clean_latents_4x': clean_latents_4x,
        'clean_latent_indices': clean_latent_indices,
        'clean_latent_2x_indices': idx_2x,
        'clean_latent_4x_indices': idx_4x,
        'camera_embedding': combined_camera,
        'modality_type': modality_type,  # modality tag for MoE routing
        'current_length': T,
        'next_length': T + target_frames_to_generate
    }
566
+
567
+
568
+ def inference_moe_framepack_sliding_window(
569
+ condition_pth_path,
570
+ dit_path,
571
+ output_path="moe/infer_results/output_moe_framepack_sliding.mp4",
572
+ start_frame=0,
573
+ initial_condition_frames=8,
574
+ frames_per_generation=4,
575
+ total_frames_to_generate=32,
576
+ max_history_frames=49,
577
+ device="cuda",
578
+ prompt="A video of a scene shot using a pedestrian's front camera while walking",
579
+ modality_type="sekai", # "sekai" 或 "nuscenes"
580
+ use_real_poses=True,
581
+ scene_info_path=None, # 对于NuScenes数据集
582
+ # CFG参数
583
+ use_camera_cfg=True,
584
+ camera_guidance_scale=2.0,
585
+ text_guidance_scale=1.0,
586
+ # MoE参数
587
+ moe_num_experts=4,
588
+ moe_top_k=2,
589
+ moe_hidden_dim=None
590
+ ):
591
+ """
592
+ MoE FramePack滑动窗口视频生成 - 支持多模态
593
+ """
594
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
595
+ print(f"🔧 MoE FramePack滑动窗口生成开始...")
596
+ print(f"模态类型: {modality_type}")
597
+ print(f"Camera CFG: {use_camera_cfg}, Camera guidance scale: {camera_guidance_scale}")
598
+ print(f"Text guidance scale: {text_guidance_scale}")
599
+ print(f"MoE配置: experts={moe_num_experts}, top_k={moe_top_k}")
600
+
601
+ # 1. 模型初始化
602
+ replace_dit_model_in_manager()
603
+
604
+ model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
605
+ model_manager.load_models([
606
+ "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
607
+ "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
608
+ "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
609
+ ])
610
+ pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager, device="cuda")
611
+
612
+ # 2. 添加传统camera编码器(兼容性)
613
+ dim = pipe.dit.blocks[0].self_attn.q.weight.shape[0]
614
+ for block in pipe.dit.blocks:
615
+ block.cam_encoder = nn.Linear(13, dim)
616
+ block.projector = nn.Linear(dim, dim)
617
+ block.cam_encoder.weight.data.zero_()
618
+ block.cam_encoder.bias.data.zero_()
619
+ block.projector.weight = nn.Parameter(torch.eye(dim))
620
+ block.projector.bias = nn.Parameter(torch.zeros(dim))
621
+
622
+ # 3. 添加FramePack组件
623
+ add_framepack_components(pipe.dit)
624
+
625
+ # 4. 添加MoE组件
626
+ moe_config = {
627
+ "num_experts": moe_num_experts,
628
+ "top_k": moe_top_k,
629
+ "hidden_dim": moe_hidden_dim or dim * 2,
630
+ "sekai_input_dim": 13, # Sekai: 12维pose + 1维mask
631
+ "nuscenes_input_dim": 8, # NuScenes: 7维pose + 1维mask
632
+ "openx_input_dim": 13 # OpenX: 12维pose + 1维mask (类似sekai)
633
+ }
634
+ add_moe_components(pipe.dit, moe_config)
635
+
636
+ # 5. 加载训练好的权重
637
+ dit_state_dict = torch.load(dit_path, map_location="cpu")
638
+ pipe.dit.load_state_dict(dit_state_dict, strict=False) # 使用strict=False以兼容新增的MoE组件
639
+ pipe = pipe.to(device)
640
+ model_dtype = next(pipe.dit.parameters()).dtype
641
+
642
+ if hasattr(pipe.dit, 'clean_x_embedder'):
643
+ pipe.dit.clean_x_embedder = pipe.dit.clean_x_embedder.to(dtype=model_dtype)
644
+
645
+ pipe.scheduler.set_timesteps(50)
646
+
647
+ # 6. 加载初始条件
648
+ print("Loading initial condition frames...")
649
+ initial_latents, encoded_data = load_encoded_video_from_pth(
650
+ condition_pth_path,
651
+ start_frame=start_frame,
652
+ num_frames=initial_condition_frames
653
+ )
654
+
655
+ # 空间裁剪
656
+ target_height, target_width = 60, 104
657
+ C, T, H, W = initial_latents.shape
658
+
659
+ if H > target_height or W > target_width:
660
+ h_start = (H - target_height) // 2
661
+ w_start = (W - target_width) // 2
662
+ initial_latents = initial_latents[:, :, h_start:h_start+target_height, w_start:w_start+target_width]
663
+ H, W = target_height, target_width
664
+
665
+ history_latents = initial_latents.to(device, dtype=model_dtype)
666
+
667
+ print(f"初始history_latents shape: {history_latents.shape}")
668
+
669
+ # 7. 编码prompt - 支持CFG
670
+ if text_guidance_scale > 1.0:
671
+ prompt_emb_pos = pipe.encode_prompt(prompt)
672
+ prompt_emb_neg = pipe.encode_prompt("")
673
+ print(f"使用Text CFG,guidance scale: {text_guidance_scale}")
674
+ else:
675
+ prompt_emb_pos = pipe.encode_prompt(prompt)
676
+ prompt_emb_neg = None
677
+ print("不使用Text CFG")
678
+
679
+ # 8. 加载场景信息(对于NuScenes)
680
+ scene_info = None
681
+ if modality_type == "nuscenes" and scene_info_path and os.path.exists(scene_info_path):
682
+ with open(scene_info_path, 'r') as f:
683
+ scene_info = json.load(f)
684
+ print(f"加载NuScenes场景信息: {scene_info_path}")
685
+
686
+ # 9. 预生成完整的camera embedding序列
687
+ if modality_type == "sekai":
688
+ camera_embedding_full = generate_sekai_camera_embeddings_sliding(
689
+ encoded_data.get('cam_emb', None),
690
+ 0,
691
+ max_history_frames,
692
+ 0,
693
+ 0,
694
+ use_real_poses=use_real_poses
695
+ ).to(device, dtype=model_dtype)
696
+ elif modality_type == "nuscenes":
697
+ camera_embedding_full = generate_nuscenes_camera_embeddings_sliding(
698
+ scene_info,
699
+ 0,
700
+ max_history_frames,
701
+ 0
702
+ ).to(device, dtype=model_dtype)
703
+ elif modality_type == "openx":
704
+ camera_embedding_full = generate_openx_camera_embeddings_sliding(
705
+ encoded_data,
706
+ 0,
707
+ max_history_frames,
708
+ 0,
709
+ use_real_poses=use_real_poses
710
+ ).to(device, dtype=model_dtype)
711
+ else:
712
+ raise ValueError(f"不支持的模态类型: {modality_type}")
713
+
714
+ print(f"完整camera序列shape: {camera_embedding_full.shape}")
715
+
716
+ # 10. 为Camera CFG创建无条件的camera embedding
717
+ if use_camera_cfg:
718
+ camera_embedding_uncond = torch.zeros_like(camera_embedding_full)
719
+ print(f"创建无条件camera embedding用于CFG")
720
+
721
+ # 11. 滑动窗口生成循环
722
+ total_generated = 0
723
+ all_generated_frames = []
724
+
725
+ while total_generated < total_frames_to_generate:
726
+ current_generation = min(frames_per_generation, total_frames_to_generate - total_generated)
727
+ print(f"\n🔧 生成步骤 {total_generated // frames_per_generation + 1}")
728
+ print(f"当前历史长度: {history_latents.shape[1]}, 本次生成: {current_generation}")
729
+
730
+ # FramePack数据准备 - MoE版本
731
+ framepack_data = prepare_framepack_sliding_window_with_camera_moe(
732
+ history_latents,
733
+ current_generation,
734
+ camera_embedding_full,
735
+ start_frame,
736
+ modality_type,
737
+ max_history_frames
738
+ )
739
+
740
+ # 准备输入
741
+ clean_latents = framepack_data['clean_latents'].unsqueeze(0)
742
+ clean_latents_2x = framepack_data['clean_latents_2x'].unsqueeze(0)
743
+ clean_latents_4x = framepack_data['clean_latents_4x'].unsqueeze(0)
744
+ camera_embedding = framepack_data['camera_embedding'].unsqueeze(0)
745
+
746
+ # 准备modality_inputs
747
+ modality_inputs = {modality_type: camera_embedding}
748
+
749
+ # 为CFG准备无条件camera embedding
750
+ if use_camera_cfg:
751
+ camera_embedding_uncond_batch = camera_embedding_uncond[:camera_embedding.shape[1], :].unsqueeze(0)
752
+ modality_inputs_uncond = {modality_type: camera_embedding_uncond_batch}
753
+
754
+ # 索引处理
755
+ latent_indices = framepack_data['latent_indices'].unsqueeze(0).cpu()
756
+ clean_latent_indices = framepack_data['clean_latent_indices'].unsqueeze(0).cpu()
757
+ clean_latent_2x_indices = framepack_data['clean_latent_2x_indices'].unsqueeze(0).cpu()
758
+ clean_latent_4x_indices = framepack_data['clean_latent_4x_indices'].unsqueeze(0).cpu()
759
+
760
+ # 初始化要生成的latents
761
+ new_latents = torch.randn(
762
+ 1, C, current_generation, H, W,
763
+ device=device, dtype=model_dtype
764
+ )
765
+
766
+ extra_input = pipe.prepare_extra_input(new_latents)
767
+
768
+ print(f"Camera embedding shape: {camera_embedding.shape}")
769
+ print(f"Camera mask分布 - condition: {torch.sum(camera_embedding[0, :, -1] == 1.0).item()}, target: {torch.sum(camera_embedding[0, :, -1] == 0.0).item()}")
770
+
771
+ # 去噪循环 - 支持CFG
772
+ timesteps = pipe.scheduler.timesteps
773
+
774
+ for i, timestep in enumerate(timesteps):
775
+ if i % 10 == 0:
776
+ print(f" 去噪步骤 {i+1}/{len(timesteps)}")
777
+
778
+ timestep_tensor = timestep.unsqueeze(0).to(device, dtype=model_dtype)
779
+
780
+ with torch.no_grad():
781
+ # CFG推理
782
+ if use_camera_cfg and camera_guidance_scale > 1.0:
783
+ # 条件预测(有camera)
784
+ noise_pred_cond, moe_loess = pipe.dit(
785
+ new_latents,
786
+ timestep=timestep_tensor,
787
+ cam_emb=camera_embedding,
788
+ modality_inputs=modality_inputs, # MoE模态输入
789
+ latent_indices=latent_indices,
790
+ clean_latents=clean_latents,
791
+ clean_latent_indices=clean_latent_indices,
792
+ clean_latents_2x=clean_latents_2x,
793
+ clean_latent_2x_indices=clean_latent_2x_indices,
794
+ clean_latents_4x=clean_latents_4x,
795
+ clean_latent_4x_indices=clean_latent_4x_indices,
796
+ **prompt_emb_pos,
797
+ **extra_input
798
+ )
799
+
800
+ # 无条件预测(无camera)
801
+ noise_pred_uncond, moe_loess = pipe.dit(
802
+ new_latents,
803
+ timestep=timestep_tensor,
804
+ cam_emb=camera_embedding_uncond_batch,
805
+ modality_inputs=modality_inputs_uncond, # MoE无条件模态输入
806
+ latent_indices=latent_indices,
807
+ clean_latents=clean_latents,
808
+ clean_latent_indices=clean_latent_indices,
809
+ clean_latents_2x=clean_latents_2x,
810
+ clean_latent_2x_indices=clean_latent_2x_indices,
811
+ clean_latents_4x=clean_latents_4x,
812
+ clean_latent_4x_indices=clean_latent_4x_indices,
813
+ **(prompt_emb_neg if prompt_emb_neg else prompt_emb_pos),
814
+ **extra_input
815
+ )
816
+
817
+ # Camera CFG
818
+ noise_pred = noise_pred_uncond + camera_guidance_scale * (noise_pred_cond - noise_pred_uncond)
819
+
820
+ # 如果同时使用Text CFG
821
+ if text_guidance_scale > 1.0 and prompt_emb_neg:
822
+ noise_pred_text_uncond, moe_loess = pipe.dit(
823
+ new_latents,
824
+ timestep=timestep_tensor,
825
+ cam_emb=camera_embedding,
826
+ modality_inputs=modality_inputs,
827
+ latent_indices=latent_indices,
828
+ clean_latents=clean_latents,
829
+ clean_latent_indices=clean_latent_indices,
830
+ clean_latents_2x=clean_latents_2x,
831
+ clean_latent_2x_indices=clean_latent_2x_indices,
832
+ clean_latents_4x=clean_latents_4x,
833
+ clean_latent_4x_indices=clean_latent_4x_indices,
834
+ **prompt_emb_neg,
835
+ **extra_input
836
+ )
837
+
838
+ # 应用Text CFG到已经应用Camera CFG的结果
839
+ noise_pred = noise_pred_text_uncond + text_guidance_scale * (noise_pred - noise_pred_text_uncond)
840
+
841
+ elif text_guidance_scale > 1.0 and prompt_emb_neg:
842
+ # 只使用Text CFG
843
+ noise_pred_cond, moe_loess = pipe.dit(
844
+ new_latents,
845
+ timestep=timestep_tensor,
846
+ cam_emb=camera_embedding,
847
+ modality_inputs=modality_inputs,
848
+ latent_indices=latent_indices,
849
+ clean_latents=clean_latents,
850
+ clean_latent_indices=clean_latent_indices,
851
+ clean_latents_2x=clean_latents_2x,
852
+ clean_latent_2x_indices=clean_latent_2x_indices,
853
+ clean_latents_4x=clean_latents_4x,
854
+ clean_latent_4x_indices=clean_latent_4x_indices,
855
+ **prompt_emb_pos,
856
+ **extra_input
857
+ )
858
+
859
+ noise_pred_uncond, moe_loess= pipe.dit(
860
+ new_latents,
861
+ timestep=timestep_tensor,
862
+ cam_emb=camera_embedding,
863
+ modality_inputs=modality_inputs,
864
+ latent_indices=latent_indices,
865
+ clean_latents=clean_latents,
866
+ clean_latent_indices=clean_latent_indices,
867
+ clean_latents_2x=clean_latents_2x,
868
+ clean_latent_2x_indices=clean_latent_2x_indices,
869
+ clean_latents_4x=clean_latents_4x,
870
+ clean_latent_4x_indices=clean_latent_4x_indices,
871
+ **prompt_emb_neg,
872
+ **extra_input
873
+ )
874
+
875
+ noise_pred = noise_pred_uncond + text_guidance_scale * (noise_pred_cond - noise_pred_uncond)
876
+
877
+ else:
878
+ # 标准推理(无CFG)
879
+ noise_pred, moe_loess = pipe.dit(
880
+ new_latents,
881
+ timestep=timestep_tensor,
882
+ cam_emb=camera_embedding,
883
+ modality_inputs=modality_inputs, # MoE模态输入
884
+ latent_indices=latent_indices,
885
+ clean_latents=clean_latents,
886
+ clean_latent_indices=clean_latent_indices,
887
+ clean_latents_2x=clean_latents_2x,
888
+ clean_latent_2x_indices=clean_latent_2x_indices,
889
+ clean_latents_4x=clean_latents_4x,
890
+ clean_latent_4x_indices=clean_latent_4x_indices,
891
+ **prompt_emb_pos,
892
+ **extra_input
893
+ )
894
+
895
+ new_latents = pipe.scheduler.step(noise_pred, timestep, new_latents)
896
+
897
+ # 更新历史
898
+ new_latents_squeezed = new_latents.squeeze(0)
899
+ history_latents = torch.cat([history_latents, new_latents_squeezed], dim=1)
900
+
901
+ # 维护滑动窗口
902
+ if history_latents.shape[1] > max_history_frames:
903
+ first_frame = history_latents[:, 0:1, :, :]
904
+ recent_frames = history_latents[:, -(max_history_frames-1):, :, :]
905
+ history_latents = torch.cat([first_frame, recent_frames], dim=1)
906
+ print(f"历史窗口已满,保留第一帧+最新{max_history_frames-1}帧")
907
+
908
+ print(f"更新后history_latents shape: {history_latents.shape}")
909
+
910
+ all_generated_frames.append(new_latents_squeezed)
911
+ total_generated += current_generation
912
+
913
+ print(f"✅ 已生成 {total_generated}/{total_frames_to_generate} 帧")
914
+
915
+ # 12. 解码和保存
916
+ print("\n🔧 解码生成的视频...")
917
+
918
+ all_generated = torch.cat(all_generated_frames, dim=1)
919
+ final_video = torch.cat([initial_latents.to(all_generated.device), all_generated], dim=1).unsqueeze(0)
920
+
921
+ print(f"最终视频shape: {final_video.shape}")
922
+
923
+ decoded_video = pipe.decode_video(final_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16))
924
+
925
+ print(f"Saving video to {output_path}")
926
+
927
+ video_np = decoded_video[0].to(torch.float32).permute(1, 2, 3, 0).cpu().numpy()
928
+ video_np = (video_np * 0.5 + 0.5).clip(0, 1)
929
+ video_np = (video_np * 255).astype(np.uint8)
930
+
931
+ with imageio.get_writer(output_path, fps=20) as writer:
932
+ for frame in video_np:
933
+ writer.append_data(frame)
934
+
935
+ print(f"🔧 MoE FramePack滑动窗口生成完成! 保存到: {output_path}")
936
+ print(f"总共生成了 {total_generated} 帧 (压缩后), 对应原始 {total_generated * 4} 帧")
937
+ print(f"使用模态: {modality_type}")
938
+
939
+
940
def str2bool(value):
    """Parse common truthy/falsy CLI strings into a real bool.

    argparse with a bare ``default=`` (and no ``type=``) treats any non-empty
    command-line string — including "False" — as truthy, so boolean options
    need an explicit parser.
    """
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() in ("1", "true", "t", "yes", "y")


def main():
    """CLI entry point for MoE FramePack sliding-window video generation."""
    parser = argparse.ArgumentParser(description="MoE FramePack滑动窗口视频生成 - 支持多模态")

    # Basic I/O and sliding-window parameters.
    parser.add_argument("--condition_pth", type=str,
                        default="/share_zhuyixuan05/zhuyixuan05/sekai-game-walking/00100100001_0004650_0004950/encoded_video.pth")
    #default="/share_zhuyixuan05/zhuyixuan05/nuscenes_video_generation_dynamic/scenes/scene-0001_CAM_FRONT/encoded_video-480p.pth")
    #default="/share_zhuyixuan05/zhuyixuan05/spatialvid/a9a6d37f-0a6c-548a-a494-7d902469f3f2_0000000_0000300/encoded_video.pth")
    #default="/share_zhuyixuan05/zhuyixuan05/openx-fractal-encoded/episode_000001/encoded_video.pth")
    parser.add_argument("--start_frame", type=int, default=0)
    parser.add_argument("--initial_condition_frames", type=int, default=16)
    parser.add_argument("--frames_per_generation", type=int, default=8)
    parser.add_argument("--total_frames_to_generate", type=int, default=24)
    parser.add_argument("--max_history_frames", type=int, default=100)
    # BUGFIX: was ``default=True`` with no ``type=``, so "--use_real_poses False"
    # parsed as the truthy string "False". Same default, real bool parsing now.
    parser.add_argument("--use_real_poses", type=str2bool, default=True)
    parser.add_argument("--dit_path", type=str,
                        default="/share_zhuyixuan05/zhuyixuan05/ICLR2026/framepack_moe/step25000_first.ckpt")
    parser.add_argument("--output_path", type=str,
                        default='/home/zhuyixuan05/ReCamMaster/moe/infer_results/output_moe_framepack_sliding.mp4')
    parser.add_argument("--prompt", type=str,
                        default="A drone flying scene in a game world ")
    parser.add_argument("--device", type=str, default="cuda")

    # Modality selection.
    parser.add_argument("--modality_type", type=str, choices=["sekai", "nuscenes", "openx"], default="sekai",
                        help="模态类型:sekai 或 nuscenes 或 openx")
    parser.add_argument("--scene_info_path", type=str, default=None,
                        help="NuScenes场景信息文件路径(仅用于nuscenes模态)")

    # Classifier-free-guidance parameters.
    # BUGFIX: same bare-default boolean pitfall as --use_real_poses above.
    parser.add_argument("--use_camera_cfg", type=str2bool, default=False,
                        help="使用Camera CFG")
    parser.add_argument("--camera_guidance_scale", type=float, default=2.0,
                        help="Camera guidance scale for CFG")
    parser.add_argument("--text_guidance_scale", type=float, default=1.0,
                        help="Text guidance scale for CFG")

    # MoE parameters.
    parser.add_argument("--moe_num_experts", type=int, default=3, help="专家数量")
    parser.add_argument("--moe_top_k", type=int, default=1, help="Top-K专家")
    parser.add_argument("--moe_hidden_dim", type=int, default=None, help="MoE隐藏层维度")

    args = parser.parse_args()

    print(f"🔧 MoE FramePack CFG生成设置:")
    print(f"模态类型: {args.modality_type}")
    print(f"Camera CFG: {args.use_camera_cfg}")
    if args.use_camera_cfg:
        print(f"Camera guidance scale: {args.camera_guidance_scale}")
    print(f"Text guidance scale: {args.text_guidance_scale}")
    print(f"MoE配置: experts={args.moe_num_experts}, top_k={args.moe_top_k}")
    # BUGFIX: separator was missing between "DiT" and the checkpoint path.
    print(f"DiT: {args.dit_path}")

    # NuScenes without pose metadata falls back to synthetic poses downstream.
    if args.modality_type == "nuscenes" and not args.scene_info_path:
        print("⚠️ 使用NuScenes模态但未提供scene_info_path,将使用合成pose数据")

    inference_moe_framepack_sliding_window(
        condition_pth_path=args.condition_pth,
        dit_path=args.dit_path,
        output_path=args.output_path,
        start_frame=args.start_frame,
        initial_condition_frames=args.initial_condition_frames,
        frames_per_generation=args.frames_per_generation,
        total_frames_to_generate=args.total_frames_to_generate,
        max_history_frames=args.max_history_frames,
        device=args.device,
        prompt=args.prompt,
        modality_type=args.modality_type,
        use_real_poses=args.use_real_poses,
        scene_info_path=args.scene_info_path,
        # CFG parameters
        use_camera_cfg=args.use_camera_cfg,
        camera_guidance_scale=args.camera_guidance_scale,
        text_guidance_scale=args.text_guidance_scale,
        # MoE parameters
        moe_num_experts=args.moe_num_experts,
        moe_top_k=args.moe_top_k,
        moe_hidden_dim=args.moe_hidden_dim
    )
1020
+
1021
+
1022
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
scripts/infer_moe_spatialvid.py ADDED
@@ -0,0 +1,1008 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import numpy as np
5
+ from PIL import Image
6
+ import imageio
7
+ import json
8
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
9
+ import argparse
10
+ from torchvision.transforms import v2
11
+ from einops import rearrange
12
+ import copy
13
+ from scipy.spatial.transform import Rotation as R
14
+
15
def compute_relative_pose_matrix(pose1, pose2):
    """Return the 3x4 relative camera matrix [R_rel | t_rel] between two frames.

    Args:
        pose1: pose of frame i, a length-7 array [tx, ty, tz, qx, qy, qz, qw].
        pose2: pose of frame i+1 in the same layout.

    Returns:
        np.ndarray of shape (3, 4): the relative rotation in the first three
        columns and the relative translation in the last column.
    """
    # Split each pose into its translation and quaternion parts.
    t_prev, q_prev = pose1[:3], pose1[3:]
    t_next, q_next = pose2[:3], pose2[3:]

    # Relative rotation: next-frame rotation composed with the inverse of the
    # previous-frame rotation.
    rot_prev = R.from_quat(q_prev)
    rot_next = R.from_quat(q_next)
    R_rel = (rot_next * rot_prev.inv()).as_matrix()

    # Relative translation expressed in the previous camera frame:
    # R_prev^T @ (t_next - t_prev).
    t_rel = rot_prev.as_matrix().T @ (t_next - t_prev)

    # Pack into a single 3x4 matrix [R_rel | t_rel].
    return np.hstack([R_rel, t_rel.reshape(3, 1)])
46
+
47
def load_encoded_video_from_pth(pth_path, start_frame=0, num_frames=10):
    """Load pre-encoded video latents from a .pth file and slice a frame window.

    Args:
        pth_path: path to a torch-saved dict containing a 'latents' tensor [C, T, H, W].
        start_frame: first latent frame (inclusive) of the window.
        num_frames: number of latent frames to extract.

    Returns:
        (condition_latents, encoded_data): the [C, num_frames, H, W] slice and
        the full dict loaded from disk.

    Raises:
        ValueError: if the requested window extends past the stored sequence.
    """
    print(f"Loading encoded video from {pth_path}")

    encoded_data = torch.load(pth_path, weights_only=False, map_location="cpu")
    full_latents = encoded_data['latents']  # expected layout [C, T, H, W]

    print(f"Full latents shape: {full_latents.shape}")
    print(f"Extracting frames {start_frame} to {start_frame + num_frames}")

    end_frame = start_frame + num_frames
    if end_frame > full_latents.shape[1]:
        raise ValueError(f"Not enough frames: requested {end_frame}, available {full_latents.shape[1]}")

    condition_latents = full_latents[:, start_frame:end_frame, :, :]
    print(f"Extracted condition latents shape: {condition_latents.shape}")

    return condition_latents, encoded_data
64
+
65
+
66
def compute_relative_pose(pose_a, pose_b, use_torch=False):
    """Compute camera B's pose relative to camera A as ``pose_b @ inv(pose_a)``.

    Args:
        pose_a: 4x4 extrinsic matrix of camera A (numpy array or torch tensor).
        pose_b: 4x4 extrinsic matrix of camera B.
        use_torch: if True, compute with torch; otherwise with numpy.

    Returns:
        4x4 relative pose matrix in the backend selected by ``use_torch``.
    """
    assert pose_a.shape == (4, 4), f"相机A外参矩阵形状应为(4,4),实际为{pose_a.shape}"
    assert pose_b.shape == (4, 4), f"相机B外参矩阵形状应为(4,4),实际为{pose_b.shape}"

    if use_torch:
        # Coerce numpy inputs to float tensors before inverting.
        a = pose_a if isinstance(pose_a, torch.Tensor) else torch.from_numpy(pose_a).float()
        b = pose_b if isinstance(pose_b, torch.Tensor) else torch.from_numpy(pose_b).float()
        return torch.matmul(b, torch.inverse(a))

    # numpy path: coerce list-like inputs to float32 arrays.
    a = pose_a if isinstance(pose_a, np.ndarray) else np.array(pose_a, dtype=np.float32)
    b = pose_b if isinstance(pose_b, np.ndarray) else np.array(pose_b, dtype=np.float32)
    return np.matmul(b, np.linalg.inv(a))
89
+
90
+
91
def replace_dit_model_in_manager():
    """Swap the registered 'wan_video_dit' loader class for the MoE variant.

    Mutates diffsynth's global ``model_loader_configs`` list in place so that
    subsequent model loading instantiates ``WanModelMoe`` instead of the stock
    DiT class. Entries that do not mention 'wan_video_dit' are left untouched.
    """
    from diffsynth.models.wan_video_dit_moe import WanModelMoe
    from diffsynth.configs.model_config import model_loader_configs

    for idx, entry in enumerate(model_loader_configs):
        keys_hash, keys_hash_with_shape, model_names, model_classes, model_resource = entry

        if 'wan_video_dit' not in model_names:
            continue

        patched_names = []
        patched_classes = []
        for name, cls in zip(model_names, model_classes):
            if name == 'wan_video_dit':
                patched_names.append(name)
                patched_classes.append(WanModelMoe)
                print(f"✅ 替换了模型类: {name} -> WanModelMoe")
            else:
                patched_names.append(name)
                patched_classes.append(cls)

        # Rewrite the config tuple in place so later loaders see the MoE class.
        model_loader_configs[idx] = (keys_hash, keys_hash_with_shape, patched_names, patched_classes, model_resource)
113
+
114
+
115
def add_framepack_components(dit_model):
    """Attach FramePack's clean-latent embedder to ``dit_model`` if missing.

    The embedder downsamples 16-channel clean latents at 1x/2x/4x scales via
    strided 3D convolutions whose output channels match the transformer width
    (read from the first block's query projection). Idempotent: does nothing
    when ``clean_x_embedder`` is already present.
    """
    if hasattr(dit_model, 'clean_x_embedder'):
        return

    # Transformer hidden width, inferred from the first self-attention q weight.
    inner_dim = dit_model.blocks[0].self_attn.q.weight.shape[0]

    class CleanXEmbedder(nn.Module):
        def __init__(self, inner_dim):
            super().__init__()
            self.proj = nn.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
            self.proj_2x = nn.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
            self.proj_4x = nn.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))

        def forward(self, x, scale="1x"):
            # Dispatch to the conv matching the requested scale, casting the
            # input to that layer's parameter dtype first.
            convs = {"1x": self.proj, "2x": self.proj_2x, "4x": self.proj_4x}
            if scale not in convs:
                raise ValueError(f"Unsupported scale: {scale}")
            conv = convs[scale]
            return conv(x.to(conv.weight.dtype))

    # Match the embedder's dtype to the rest of the model's parameters.
    model_dtype = next(dit_model.parameters()).dtype
    dit_model.clean_x_embedder = CleanXEmbedder(inner_dim).to(dtype=model_dtype)
    print("✅ 添加了FramePack的clean_x_embedder组件")
144
+
145
+
146
def add_moe_components(dit_model, moe_config):
    """Attach per-block MoE routing components and store ``moe_config``.

    Every transformer block receives a 'sekai' ModalityProcessor (13-dim pose
    input -> ``unified_dim``) and a MultiModalMoE head mapping ``unified_dim``
    to the block's hidden width. The config dict itself is stored on the model
    the first time this runs.
    """
    if not hasattr(dit_model, 'moe_config'):
        dit_model.moe_config = moe_config
        print("✅ 添加了MoE配置到模型")

    from diffsynth.models.wan_video_dit_moe import ModalityProcessor, MultiModalMoE

    # Hidden width of the transformer blocks (from the q projection weight).
    hidden_dim = dit_model.blocks[0].self_attn.q.weight.shape[0]
    unified_dim = moe_config.get("unified_dim", 25)
    num_experts = moe_config.get("num_experts", 4)

    for idx, block in enumerate(dit_model.blocks):
        # Sekai modality: 13-dim camera embedding projected to unified_dim.
        block.sekai_processor = ModalityProcessor("sekai", 13, unified_dim)

        # MoE head: unified_dim in, transformer hidden width out.
        block.moe = MultiModalMoE(
            unified_dim=unified_dim,
            output_dim=hidden_dim,
            num_experts=num_experts,
            top_k=moe_config.get("top_k", 2)
        )

        print(f"✅ Block {idx} 添加了MoE组件 (unified_dim: {unified_dim}, experts: {num_experts})")
174
+
175
+
176
def generate_sekai_camera_embeddings_sliding(cam_data, start_frame, current_history_length, new_frames, total_generated, use_real_poses=True):
    """Generate per-frame camera embeddings for the Sekai dataset (sliding-window version).

    Each output row is a flattened 3x4 relative pose (12 values) plus one
    trailing condition-mask column (1.0 = conditioning frame, 0.0 = frame to
    be generated), giving 13 columns total.

    Args:
        cam_data: dict with an 'extrinsic' pose sequence, or None to force synthetic motion.
        start_frame: first latent frame treated as conditioning.
        current_history_length: number of latent frames already available as history.
        new_frames: number of new latent frames about to be generated.
        total_generated: running count of generated frames — unused in this body;
            presumably kept for signature compatibility with callers (TODO confirm).
        use_real_poses: when True and cam_data provides extrinsics, derive poses
            from data; otherwise synthesize a constant left-turning motion.

    Returns:
        torch.bfloat16 tensor of shape [max_needed_frames, 13].
    """
    # One latent frame corresponds to 4 raw video frames.
    time_compression_ratio = 4

    # Camera frames required by the fixed FramePack index layout:
    # [start(1) | 4x(16) | 2x(2) | 1x(1) | new_frames].
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames

    if use_real_poses and cam_data is not None and 'extrinsic' in cam_data:
        print("🔧 使用真实Sekai camera数据")
        cam_extrinsic = cam_data['extrinsic']

        # Generate a camera sequence long enough for every consumer
        # (sliding window, FramePack layout, and a floor of 30 frames).
        max_needed_frames = max(
            start_frame + current_history_length + new_frames,
            framepack_needed_frames,
            30
        )

        print(f"🔧 计算Sekai camera序列长度:")
        print(f" - 基础需求: {start_frame + current_history_length + new_frames}")
        print(f" - FramePack需求: {framepack_needed_frames}")
        print(f" - 最终生成: {max_needed_frames}")

        relative_poses = []
        for i in range(max_needed_frames):
            # Map latent index i back to raw-frame indices (stride of 4).
            frame_idx = i * time_compression_ratio
            next_frame_idx = frame_idx + time_compression_ratio

            if next_frame_idx < len(cam_extrinsic):
                cam_prev = cam_extrinsic[frame_idx]
                cam_next = cam_extrinsic[next_frame_idx]
                # Relative pose between the two sampled frames; poses here are
                # the 7D [t, quat] layout consumed by compute_relative_pose_matrix.
                relative_pose = compute_relative_pose_matrix(cam_prev, cam_next)
                # relative_pose is already 3x4; the [:3, :] slice is a no-op kept
                # for safety.
                relative_poses.append(torch.as_tensor(relative_pose[:3, :]))
            else:
                # Past the end of the recorded trajectory: fall back to zero motion.
                print(f"⚠️ 帧{frame_idx}超出camera数据范围,使用零运动")
                relative_poses.append(torch.zeros(3, 4))

        pose_embedding = torch.stack(relative_poses, dim=0)
        # Flatten each 3x4 pose matrix into a 12-vector per frame.
        pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)')

        # Condition mask: frames [start_frame, start_frame + history) are conditioning.
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        condition_end = min(start_frame + current_history_length, max_needed_frames)
        mask[start_frame:condition_end] = 1.0

        camera_embedding = torch.cat([pose_embedding, mask], dim=1)
        print(f"🔧 Sekai真实camera embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)

    else:
        print("🔧 使用Sekai合成camera数据")

        max_needed_frames = max(
            start_frame + current_history_length + new_frames,
            framepack_needed_frames,
            30
        )

        print(f"🔧 生成Sekai合成camera帧数: {max_needed_frames}")
        relative_poses = []
        for i in range(max_needed_frames):
            # Synthetic motion: constant per-frame left turn with slow forward drift.
            yaw_per_frame = 0.05    # left-turn angle per frame (positive = left)
            forward_speed = 0.005   # forward distance per frame

            pose = np.eye(4, dtype=np.float32)

            # Rotation about the Y axis (left turn).
            cos_yaw = np.cos(yaw_per_frame)
            sin_yaw = np.sin(yaw_per_frame)

            pose[0, 0] = cos_yaw
            pose[0, 2] = sin_yaw
            pose[2, 0] = -sin_yaw
            pose[2, 2] = cos_yaw

            # Translation in the rotated local frame: negative local Z = forward.
            pose[2, 3] = -forward_speed

            # Slight drift toward the circle center, approximating a circular path.
            radius_drift = 0.002
            pose[0, 3] = -radius_drift  # negative local X = leftward

            relative_pose = pose[:3, :]
            relative_poses.append(torch.as_tensor(relative_pose))

        pose_embedding = torch.stack(relative_poses, dim=0)
        pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)')

        # Condition mask, same convention as the real-pose branch above.
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        condition_end = min(start_frame + current_history_length, max_needed_frames)
        mask[start_frame:condition_end] = 1.0

        camera_embedding = torch.cat([pose_embedding, mask], dim=1)
        print(f"🔧 Sekai合成camera embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)
276
+
277
def generate_openx_camera_embeddings_sliding(encoded_data, start_frame, current_history_length, new_frames, use_real_poses):
    """Generate per-frame camera embeddings for the OpenX dataset (sliding-window version).

    Same 13-column layout as the Sekai generator: a flattened 3x4 relative pose
    (12 values) plus a trailing condition-mask column. The real-pose branch uses
    4x4 extrinsic matrices (via compute_relative_pose), unlike the Sekai branch's
    7D quaternion poses.

    Args:
        encoded_data: dict expected to hold encoded_data['cam_emb']['extrinsic']
            (a sequence of 4x4 extrinsics); any missing piece triggers synthesis.
        start_frame: first latent frame treated as conditioning.
        current_history_length: number of latent frames already available as history.
        new_frames: number of new latent frames about to be generated.
        use_real_poses: when True and extrinsics are available, derive poses from
            data; otherwise synthesize small robot-arm-like motion.

    Returns:
        torch.bfloat16 tensor of shape [max_needed_frames, 13].
    """
    # One latent frame corresponds to 4 raw video frames.
    time_compression_ratio = 4

    # Camera frames required by the fixed FramePack index layout:
    # [start(1) | 4x(16) | 2x(2) | 1x(1) | new_frames].
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames

    if use_real_poses and encoded_data is not None and 'cam_emb' in encoded_data and 'extrinsic' in encoded_data['cam_emb']:
        print("🔧 使用OpenX真实camera数据")
        cam_extrinsic = encoded_data['cam_emb']['extrinsic']

        # Generate a camera sequence long enough for every consumer.
        max_needed_frames = max(
            start_frame + current_history_length + new_frames,
            framepack_needed_frames,
            30
        )

        print(f"🔧 计算OpenX camera序列长度:")
        print(f" - 基础需求: {start_frame + current_history_length + new_frames}")
        print(f" - FramePack需求: {framepack_needed_frames}")
        print(f" - 最终生成: {max_needed_frames}")

        relative_poses = []
        for i in range(max_needed_frames):
            # Same 4-frame stride as the Sekai path, applied to (typically
            # shorter) OpenX episodes.
            frame_idx = i * time_compression_ratio
            next_frame_idx = frame_idx + time_compression_ratio

            if next_frame_idx < len(cam_extrinsic):
                cam_prev = cam_extrinsic[frame_idx]
                cam_next = cam_extrinsic[next_frame_idx]
                # 4x4 relative pose; only the top 3 rows are kept below.
                relative_pose = compute_relative_pose(cam_prev, cam_next)
                relative_poses.append(torch.as_tensor(relative_pose[:3, :]))
            else:
                # Past the end of the recorded trajectory: fall back to zero motion.
                print(f"⚠️ 帧{frame_idx}超出OpenX camera数据范围,使用零运动")
                relative_poses.append(torch.zeros(3, 4))

        pose_embedding = torch.stack(relative_poses, dim=0)
        # Flatten each 3x4 pose matrix into a 12-vector per frame.
        pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)')

        # Condition mask: frames [start_frame, start_frame + history) are conditioning.
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        condition_end = min(start_frame + current_history_length, max_needed_frames)
        mask[start_frame:condition_end] = 1.0

        camera_embedding = torch.cat([pose_embedding, mask], dim=1)
        print(f"🔧 OpenX真实camera embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)

    else:
        print("🔧 使用OpenX合成camera数据")

        max_needed_frames = max(
            start_frame + current_history_length + new_frames,
            framepack_needed_frames,
            30
        )

        print(f"🔧 生成OpenX合成camera帧数: {max_needed_frames}")
        relative_poses = []
        for i in range(max_needed_frames):
            # Synthetic robot-manipulation motion: small per-frame rotations
            # and slow translation, mimicking fine robot-arm movement.
            roll_per_frame = 0.02   # slight roll
            pitch_per_frame = 0.01  # slight pitch
            yaw_per_frame = 0.015   # slight yaw
            forward_speed = 0.003   # slow forward speed

            pose = np.eye(4, dtype=np.float32)

            # Composite rotation terms (roll about X, pitch about Y, yaw about Z).
            cos_roll = np.cos(roll_per_frame)
            sin_roll = np.sin(roll_per_frame)
            cos_pitch = np.cos(pitch_per_frame)
            sin_pitch = np.sin(pitch_per_frame)
            cos_yaw = np.cos(yaw_per_frame)
            sin_yaw = np.sin(yaw_per_frame)

            # Simplified composite rotation matrix (ZYX order).
            pose[0, 0] = cos_yaw * cos_pitch
            pose[0, 1] = cos_yaw * sin_pitch * sin_roll - sin_yaw * cos_roll
            pose[0, 2] = cos_yaw * sin_pitch * cos_roll + sin_yaw * sin_roll
            pose[1, 0] = sin_yaw * cos_pitch
            pose[1, 1] = sin_yaw * sin_pitch * sin_roll + cos_yaw * cos_roll
            pose[1, 2] = sin_yaw * sin_pitch * cos_roll - cos_yaw * sin_roll
            pose[2, 0] = -sin_pitch
            pose[2, 1] = cos_pitch * sin_roll
            pose[2, 2] = cos_pitch * cos_roll

            # Translation: small X/Y adjustments, main motion along Z (depth).
            pose[0, 3] = forward_speed * 0.5
            pose[1, 3] = forward_speed * 0.3
            pose[2, 3] = -forward_speed

            relative_pose = pose[:3, :]
            relative_poses.append(torch.as_tensor(relative_pose))

        pose_embedding = torch.stack(relative_poses, dim=0)
        pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)')

        # Condition mask, same convention as the real-pose branch above.
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        condition_end = min(start_frame + current_history_length, max_needed_frames)
        mask[start_frame:condition_end] = 1.0

        camera_embedding = torch.cat([pose_embedding, mask], dim=1)
        print(f"🔧 OpenX合成camera embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)
391
+
392
+
393
def generate_nuscenes_camera_embeddings_sliding(scene_info, start_frame, current_history_length, new_frames):
    """Generate per-frame camera embeddings for NuScenes (sliding-window version).

    Kept consistent with train_moe.py (per the original author's note). Each
    output row is a 7D pose vector [tx, ty, tz, qw?, qx?, qy?, qz?] plus a
    trailing condition-mask column, giving 8 columns total — narrower than the
    13-column Sekai/OpenX embeddings.

    NOTE(review): unlike the Sekai/OpenX generators, keyframe poses are indexed
    directly per latent frame (no 4x stride), and the rotation stored is the
    absolute keyframe quaternion, not a frame-to-frame relative rotation —
    confirm this matches what train_moe.py feeds the model.

    Args:
        scene_info: dict with a 'keyframe_poses' list of {'translation', 'rotation'}
            entries, or None to synthesize straight-line motion.
        start_frame: first latent frame treated as conditioning.
        current_history_length: number of latent frames already available as history.
        new_frames: number of new latent frames about to be generated.

    Returns:
        torch.bfloat16 tensor of shape [max_needed_frames, 8].
    """
    # Declared for parity with the other generators; unused in this body.
    time_compression_ratio = 4

    # Camera frames required by the fixed FramePack index layout:
    # [start(1) | 4x(16) | 2x(2) | 1x(1) | new_frames].
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames

    if scene_info is not None and 'keyframe_poses' in scene_info:
        print("🔧 使用NuScenes真实pose数据")
        keyframe_poses = scene_info['keyframe_poses']

        if len(keyframe_poses) == 0:
            # Degenerate scene: emit all-zero poses with the usual mask layout.
            print("⚠️ NuScenes keyframe_poses为空,使用零pose")
            max_needed_frames = max(framepack_needed_frames, 30)

            pose_sequence = torch.zeros(max_needed_frames, 7, dtype=torch.float32)

            mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
            condition_end = min(start_frame + current_history_length, max_needed_frames)
            mask[start_frame:condition_end] = 1.0

            camera_embedding = torch.cat([pose_sequence, mask], dim=1)  # [max_needed_frames, 8]
            print(f"🔧 NuScenes零pose embedding shape: {camera_embedding.shape}")
            return camera_embedding.to(torch.bfloat16)

        # Translations are expressed relative to the first keyframe.
        reference_pose = keyframe_poses[0]

        max_needed_frames = max(framepack_needed_frames, 30)

        pose_vecs = []
        for i in range(max_needed_frames):
            if i < len(keyframe_poses):
                current_pose = keyframe_poses[i]

                # Relative displacement w.r.t. the reference keyframe.
                translation = torch.tensor(
                    np.array(current_pose['translation']) - np.array(reference_pose['translation']),
                    dtype=torch.float32
                )

                # Absolute keyframe quaternion (simplified — not made relative).
                rotation = torch.tensor(current_pose['rotation'], dtype=torch.float32)

                pose_vec = torch.cat([translation, rotation], dim=0)  # [7D]
            else:
                # Past the recorded keyframes: zero translation, identity quaternion.
                pose_vec = torch.cat([
                    torch.zeros(3, dtype=torch.float32),
                    torch.tensor([1.0, 0.0, 0.0, 0.0], dtype=torch.float32)
                ], dim=0)  # [7D]

            pose_vecs.append(pose_vec)

        pose_sequence = torch.stack(pose_vecs, dim=0)  # [max_needed_frames, 7]

        # Condition mask: frames [start_frame, start_frame + history) are conditioning.
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        condition_end = min(start_frame + current_history_length, max_needed_frames)
        mask[start_frame:condition_end] = 1.0

        camera_embedding = torch.cat([pose_sequence, mask], dim=1)  # [max_needed_frames, 8]
        print(f"🔧 NuScenes真实pose embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)

    else:
        print("🔧 使用NuScenes合成pose数据")
        max_needed_frames = max(framepack_needed_frames, 30)

        # Synthetic trajectory: straight forward motion along Z, no rotation.
        pose_vecs = []
        for i in range(max_needed_frames):
            translation = torch.tensor([0.0, 0.0, i * 0.1], dtype=torch.float32)  # advance along Z
            rotation = torch.tensor([1.0, 0.0, 0.0, 0.0], dtype=torch.float32)    # identity quaternion

            pose_vec = torch.cat([translation, rotation], dim=0)  # [7D]
            pose_vecs.append(pose_vec)

        pose_sequence = torch.stack(pose_vecs, dim=0)

        # Condition mask, same convention as above.
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        condition_end = min(start_frame + current_history_length, max_needed_frames)
        mask[start_frame:condition_end] = 1.0

        camera_embedding = torch.cat([pose_sequence, mask], dim=1)  # [max_needed_frames, 8]
        print(f"🔧 NuScenes合成pose embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)
482
+
483
def prepare_framepack_sliding_window_with_camera_moe(history_latents, target_frames_to_generate, camera_embedding_full, start_frame, modality_type, max_history_frames=49):
    """Assemble FramePack sliding-window inputs (MoE variant).

    Args:
        history_latents: [C, T, H, W] latents accumulated so far (T may be 0).
        target_frames_to_generate: number of new latent frames to denoise.
        camera_embedding_full: [N, D] camera sequence whose last column is the
            condition/target mask; the mask column is rebuilt here.
        start_frame: conditioning start offset (not used for the mask here,
            which is derived from the history length instead).
        modality_type: modality tag passed through for logging/downstream use.
        max_history_frames: sliding-window capacity (informational only here).

    Returns:
        dict with clean latents at 1x/2x/4x scales, their index tensors, the
        mask-rewritten camera window, and bookkeeping lengths.
    """
    C, T, H, W = history_latents.shape

    # Fixed FramePack index layout: [start(1) | 4x(16) | 2x(2) | 1x(1) | new].
    total_indices_length = 1 + 16 + 2 + 1 + target_frames_to_generate
    groups = torch.arange(0, total_indices_length).split(
        [1, 16, 2, 1, target_frames_to_generate], dim=0)
    start_idx, clean_latent_4x_indices, clean_latent_2x_indices, idx_1x, latent_indices = groups
    clean_latent_indices = torch.cat([start_idx, idx_1x], dim=0)

    # Zero-pad the camera sequence when it is shorter than the index layout.
    deficit = total_indices_length - camera_embedding_full.shape[0]
    if deficit > 0:
        pad = torch.zeros(deficit, camera_embedding_full.shape[1],
                          dtype=camera_embedding_full.dtype, device=camera_embedding_full.device)
        camera_embedding_full = torch.cat([camera_embedding_full, pad], dim=0)

    # Slice the window and rebuild the mask column from scratch:
    # everything defaults to target (0), then the trailing portion of the 19
    # clean slots backed by real history is flagged as condition (1).
    combined_camera = camera_embedding_full[:total_indices_length, :].clone()
    combined_camera[:, -1] = 0.0

    available_frames = min(T, 19) if T > 0 else 0
    if T > 0:
        combined_camera[19 - available_frames:19, -1] = 1.0

    print(f"🔧 MoE Camera mask更新:")
    print(f" - 历史帧数: {T}")
    print(f" - 有效condition帧数: {available_frames if T > 0 else 0}")
    print(f" - 模态类型: {modality_type}")

    # Build the 19-slot clean-latent buffer, right-aligned with real history.
    clean_buffer = torch.zeros(C, 19, H, W, dtype=history_latents.dtype, device=history_latents.device)
    if T > 0:
        clean_buffer[:, 19 - available_frames:, :, :] = history_latents[:, -available_frames:, :, :]

    clean_latents_4x = clean_buffer[:, 0:16, :, :]
    clean_latents_2x = clean_buffer[:, 16:18, :, :]
    clean_latents_1x = clean_buffer[:, 18:19, :, :]

    # Anchor frame: the very first history frame, or zeros before any history.
    if T > 0:
        start_latent = history_latents[:, 0:1, :, :]
    else:
        start_latent = torch.zeros(C, 1, H, W, dtype=history_latents.dtype, device=history_latents.device)

    clean_latents = torch.cat([start_latent, clean_latents_1x], dim=1)

    return {
        'latent_indices': latent_indices,
        'clean_latents': clean_latents,
        'clean_latents_2x': clean_latents_2x,
        'clean_latents_4x': clean_latents_4x,
        'clean_latent_indices': clean_latent_indices,
        'clean_latent_2x_indices': clean_latent_2x_indices,
        'clean_latent_4x_indices': clean_latent_4x_indices,
        'camera_embedding': combined_camera,
        'modality_type': modality_type,
        'current_length': T,
        'next_length': T + target_frames_to_generate
    }
552
+
553
+
554
def inference_moe_framepack_sliding_window(
    condition_pth_path,
    dit_path,
    output_path="moe/infer_results/output_moe_framepack_sliding.mp4",
    start_frame=0,
    initial_condition_frames=8,
    frames_per_generation=4,
    total_frames_to_generate=32,
    max_history_frames=49,
    device="cuda",
    prompt="A video of a scene shot using a pedestrian's front camera while walking",
    modality_type="sekai",  # "sekai", "nuscenes" or "openx"
    use_real_poses=True,
    scene_info_path=None,  # only used for the NuScenes modality
    # CFG parameters
    use_camera_cfg=True,
    camera_guidance_scale=2.0,
    text_guidance_scale=1.0,
    # MoE parameters
    moe_num_experts=4,
    moe_top_k=2,
    moe_hidden_dim=None
):
    """MoE FramePack sliding-window video generation with multi-modality support.

    Loads the Wan2.1 T2V pipeline, patches its DiT with a legacy camera
    encoder, FramePack clean-latent embedders and per-block MoE components,
    then autoregressively generates latent frames in chunks of
    ``frames_per_generation``, conditioned on a sliding window of history
    latents plus per-frame camera embeddings. Optional classifier-free
    guidance (CFG) is applied on the camera branch and/or the text branch.
    The decoded video is written to ``output_path``.

    Args:
        condition_pth_path: .pth file with pre-encoded latents (and optionally
            camera data under 'cam_emb') used as the initial condition.
        dit_path: Trained DiT checkpoint (state dict) to load on top.
        output_path: Output mp4 path; parent directories are created.
        start_frame: First latent frame to take from the condition file.
        initial_condition_frames: Number of condition latent frames to load.
        frames_per_generation: Latent frames generated per sliding-window step.
        total_frames_to_generate: Total latent frames to generate.
        max_history_frames: Cap on the history window (frame 0 stays pinned).
        device: Torch device used for inference.
        prompt: Text prompt fed to the T5 encoder.
        modality_type: One of "sekai", "nuscenes", "openx"; selects the
            camera-embedding generator and the MoE routing key.
        use_real_poses: Prefer recorded camera poses over synthetic ones.
        scene_info_path: JSON scene metadata (NuScenes only).
        use_camera_cfg: Enable CFG on the camera conditioning.
        camera_guidance_scale: CFG scale for the camera branch (>1 active).
        text_guidance_scale: CFG scale for the text branch (>1 active).
        moe_num_experts / moe_top_k / moe_hidden_dim: MoE configuration.
    """
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    print(f"🔧 MoE FramePack滑动窗口生成开始...")
    print(f"模态类型: {modality_type}")
    print(f"Camera CFG: {use_camera_cfg}, Camera guidance scale: {camera_guidance_scale}")
    print(f"Text guidance scale: {text_guidance_scale}")
    print(f"MoE配置: experts={moe_num_experts}, top_k={moe_top_k}")

    # 1. Model initialization: register the MoE DiT class before loading, then
    # load Wan2.1 weights (DiT + T5 text encoder + VAE) on CPU in bf16.
    replace_dit_model_in_manager()

    model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
    model_manager.load_models([
        "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
        "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
        "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
    ])
    pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager, device="cuda")

    # 2. Attach the legacy per-block camera encoder (kept for checkpoint
    # compatibility): zero-init encoder + identity projector, i.e. a no-op
    # until the trained checkpoint overwrites these weights.
    dim = pipe.dit.blocks[0].self_attn.q.weight.shape[0]
    for block in pipe.dit.blocks:
        block.cam_encoder = nn.Linear(13, dim)
        block.projector = nn.Linear(dim, dim)
        block.cam_encoder.weight.data.zero_()
        block.cam_encoder.bias.data.zero_()
        block.projector.weight = nn.Parameter(torch.eye(dim))
        block.projector.bias = nn.Parameter(torch.zeros(dim))

    # 3. FramePack components (multi-scale clean-latent embedder).
    add_framepack_components(pipe.dit)

    # 4. MoE components (router + experts per block).
    moe_config = {
        "num_experts": moe_num_experts,
        "top_k": moe_top_k,
        "hidden_dim": moe_hidden_dim or dim * 2,
        "sekai_input_dim": 13,  # Sekai: 12-dim pose + 1-dim mask
        "nuscenes_input_dim": 8,  # NuScenes: 7-dim pose + 1-dim mask
        "openx_input_dim": 13  # OpenX: 12-dim pose + 1-dim mask (same layout as sekai)
    }
    add_moe_components(pipe.dit, moe_config)

    # 5. Load the fine-tuned checkpoint; strict=False tolerates the newly
    # added MoE / FramePack parameter names missing from the checkpoint.
    dit_state_dict = torch.load(dit_path, map_location="cpu")
    pipe.dit.load_state_dict(dit_state_dict, strict=False)
    pipe = pipe.to(device)
    model_dtype = next(pipe.dit.parameters()).dtype

    if hasattr(pipe.dit, 'clean_x_embedder'):
        pipe.dit.clean_x_embedder = pipe.dit.clean_x_embedder.to(dtype=model_dtype)

    pipe.scheduler.set_timesteps(50)

    # 6. Load the initial condition latents from the pre-encoded file.
    print("Loading initial condition frames...")
    initial_latents, encoded_data = load_encoded_video_from_pth(
        condition_pth_path,
        start_frame=start_frame,
        num_frames=initial_condition_frames
    )

    # Center-crop in latent space to the training grid of 60x104 latent cells
    # (presumably matching 480p pixel inputs — TODO confirm).
    target_height, target_width = 60, 104
    C, T, H, W = initial_latents.shape

    if H > target_height or W > target_width:
        h_start = (H - target_height) // 2
        w_start = (W - target_width) // 2
        initial_latents = initial_latents[:, :, h_start:h_start+target_height, w_start:w_start+target_width]
        H, W = target_height, target_width

    history_latents = initial_latents.to(device, dtype=model_dtype)

    print(f"初始history_latents shape: {history_latents.shape}")

    # 7. Encode the prompt; an empty-prompt ("negative") embedding is only
    # needed when text CFG is enabled.
    if text_guidance_scale > 1.0:
        prompt_emb_pos = pipe.encode_prompt(prompt)
        prompt_emb_neg = pipe.encode_prompt("")
        print(f"使用Text CFG,guidance scale: {text_guidance_scale}")
    else:
        prompt_emb_pos = pipe.encode_prompt(prompt)
        prompt_emb_neg = None
        print("不使用Text CFG")

    # 8. Optional NuScenes scene metadata (real ego poses).
    scene_info = None
    if modality_type == "nuscenes" and scene_info_path and os.path.exists(scene_info_path):
        with open(scene_info_path, 'r') as f:
            scene_info = json.load(f)
        print(f"加载NuScenes场景信息: {scene_info_path}")

    # 9. Pre-generate the full camera-embedding sequence for the chosen
    # modality; per-step windows are sliced from it inside the loop.
    if modality_type == "sekai":
        camera_embedding_full = generate_sekai_camera_embeddings_sliding(
            encoded_data.get('cam_emb', None),
            0,
            max_history_frames,
            0,
            0,
            use_real_poses=use_real_poses
        ).to(device, dtype=model_dtype)
    elif modality_type == "nuscenes":
        camera_embedding_full = generate_nuscenes_camera_embeddings_sliding(
            scene_info,
            0,
            max_history_frames,
            0
        ).to(device, dtype=model_dtype)
    elif modality_type == "openx":
        camera_embedding_full = generate_openx_camera_embeddings_sliding(
            encoded_data,
            0,
            max_history_frames,
            0,
            use_real_poses=use_real_poses
        ).to(device, dtype=model_dtype)
    else:
        raise ValueError(f"不支持的模态类型: {modality_type}")

    print(f"完整camera序列shape: {camera_embedding_full.shape}")

    # 10. An all-zero camera sequence acts as the unconditional branch for CFG.
    if use_camera_cfg:
        camera_embedding_uncond = torch.zeros_like(camera_embedding_full)
        print(f"创建无条件camera embedding用于CFG")

    # 11. Sliding-window autoregressive generation loop.
    total_generated = 0
    all_generated_frames = []

    while total_generated < total_frames_to_generate:
        current_generation = min(frames_per_generation, total_frames_to_generate - total_generated)
        print(f"\n🔧 生成步骤 {total_generated // frames_per_generation + 1}")
        print(f"当前历史长度: {history_latents.shape[1]}, 本次生成: {current_generation}")

        # FramePack data preparation (MoE variant): clean latents at
        # 1x/2x/4x scales, index tensors and the windowed camera embedding.
        framepack_data = prepare_framepack_sliding_window_with_camera_moe(
            history_latents,
            current_generation,
            camera_embedding_full,
            start_frame,
            modality_type,
            max_history_frames
        )

        # Add the batch dimension expected by the DiT.
        clean_latents = framepack_data['clean_latents'].unsqueeze(0)
        clean_latents_2x = framepack_data['clean_latents_2x'].unsqueeze(0)
        clean_latents_4x = framepack_data['clean_latents_4x'].unsqueeze(0)
        camera_embedding = framepack_data['camera_embedding'].unsqueeze(0)

        # MoE routes on the modality key.
        modality_inputs = {modality_type: camera_embedding}

        # Trim the zero-camera sequence to this window's length for CFG
        # (camera_embedding is [1, T, D] here, so shape[1] is T).
        if use_camera_cfg:
            camera_embedding_uncond_batch = camera_embedding_uncond[:camera_embedding.shape[1], :].unsqueeze(0)
            modality_inputs_uncond = {modality_type: camera_embedding_uncond_batch}

        # Index tensors are kept on CPU.
        latent_indices = framepack_data['latent_indices'].unsqueeze(0).cpu()
        clean_latent_indices = framepack_data['clean_latent_indices'].unsqueeze(0).cpu()
        clean_latent_2x_indices = framepack_data['clean_latent_2x_indices'].unsqueeze(0).cpu()
        clean_latent_4x_indices = framepack_data['clean_latent_4x_indices'].unsqueeze(0).cpu()

        # Initialize the frames to generate from Gaussian noise.
        new_latents = torch.randn(
            1, C, current_generation, H, W,
            device=device, dtype=model_dtype
        )

        extra_input = pipe.prepare_extra_input(new_latents)

        print(f"Camera embedding shape: {camera_embedding.shape}")
        print(f"Camera mask分布 - condition: {torch.sum(camera_embedding[0, :, -1] == 1.0).item()}, target: {torch.sum(camera_embedding[0, :, -1] == 0.0).item()}")

        # Denoising loop with optional camera / text CFG.
        timesteps = pipe.scheduler.timesteps

        for i, timestep in enumerate(timesteps):
            if i % 10 == 0:
                print(f" 去噪步骤 {i+1}/{len(timesteps)}")

            timestep_tensor = timestep.unsqueeze(0).to(device, dtype=model_dtype)

            with torch.no_grad():
                # CFG inference (the auxiliary MoE loss returned by the DiT is
                # ignored at inference time).
                if use_camera_cfg and camera_guidance_scale > 1.0:
                    # Conditional prediction (with camera).
                    noise_pred_cond, moe_loss = pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding,
                        modality_inputs=modality_inputs,  # MoE modality input
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **prompt_emb_pos,
                        **extra_input
                    )

                    # Unconditional prediction (zeroed camera).
                    noise_pred_uncond, moe_loss = pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding_uncond_batch,
                        modality_inputs=modality_inputs_uncond,  # MoE unconditional modality input
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **(prompt_emb_neg if prompt_emb_neg else prompt_emb_pos),
                        **extra_input
                    )

                    # Camera CFG combination.
                    noise_pred = noise_pred_uncond + camera_guidance_scale * (noise_pred_cond - noise_pred_uncond)

                    # Optionally stack text CFG on top of the camera-guided result.
                    if text_guidance_scale > 1.0 and prompt_emb_neg:
                        noise_pred_text_uncond, moe_loss = pipe.dit(
                            new_latents,
                            timestep=timestep_tensor,
                            cam_emb=camera_embedding,
                            modality_inputs=modality_inputs,
                            latent_indices=latent_indices,
                            clean_latents=clean_latents,
                            clean_latent_indices=clean_latent_indices,
                            clean_latents_2x=clean_latents_2x,
                            clean_latent_2x_indices=clean_latent_2x_indices,
                            clean_latents_4x=clean_latents_4x,
                            clean_latent_4x_indices=clean_latent_4x_indices,
                            **prompt_emb_neg,
                            **extra_input
                        )

                        # Apply text CFG to the already camera-guided prediction.
                        noise_pred = noise_pred_text_uncond + text_guidance_scale * (noise_pred - noise_pred_text_uncond)

                elif text_guidance_scale > 1.0 and prompt_emb_neg:
                    # Text CFG only.
                    noise_pred_cond, moe_loss = pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding,
                        modality_inputs=modality_inputs,
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **prompt_emb_pos,
                        **extra_input
                    )

                    noise_pred_uncond, moe_loss = pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding,
                        modality_inputs=modality_inputs,
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **prompt_emb_neg,
                        **extra_input
                    )

                    noise_pred = noise_pred_uncond + text_guidance_scale * (noise_pred_cond - noise_pred_uncond)

                else:
                    # Standard inference (no CFG).
                    noise_pred, moe_loss = pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding,
                        modality_inputs=modality_inputs,  # MoE modality input
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **prompt_emb_pos,
                        **extra_input
                    )

            new_latents = pipe.scheduler.step(noise_pred, timestep, new_latents)

        # Append the denoised frames to the history.
        new_latents_squeezed = new_latents.squeeze(0)
        history_latents = torch.cat([history_latents, new_latents_squeezed], dim=1)

        # Maintain the sliding window: keep frame 0 pinned plus the most
        # recent (max_history_frames - 1) frames.
        if history_latents.shape[1] > max_history_frames:
            first_frame = history_latents[:, 0:1, :, :]
            recent_frames = history_latents[:, -(max_history_frames-1):, :, :]
            history_latents = torch.cat([first_frame, recent_frames], dim=1)
            print(f"历史窗口已满,保留第一帧+最新{max_history_frames-1}帧")

        print(f"更新后history_latents shape: {history_latents.shape}")

        all_generated_frames.append(new_latents_squeezed)
        total_generated += current_generation

        print(f"✅ 已生成 {total_generated}/{total_frames_to_generate} 帧")

    # 12. Decode latents to pixels and write the mp4.
    print("\n🔧 解码生成的视频...")

    all_generated = torch.cat(all_generated_frames, dim=1)
    final_video = torch.cat([initial_latents.to(all_generated.device), all_generated], dim=1).unsqueeze(0)

    print(f"最终视频shape: {final_video.shape}")

    decoded_video = pipe.decode_video(final_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16))

    print(f"Saving video to {output_path}")

    # [-1, 1] float latent-decoder output -> [0, 255] uint8, frame-major
    # layout (T, H, W, C) for imageio.
    video_np = decoded_video[0].to(torch.float32).permute(1, 2, 3, 0).cpu().numpy()
    video_np = (video_np * 0.5 + 0.5).clip(0, 1)
    video_np = (video_np * 255).astype(np.uint8)

    with imageio.get_writer(output_path, fps=20) as writer:
        for frame in video_np:
            writer.append_data(frame)

    print(f"🔧 MoE FramePack滑动窗口生成完成! 保存到: {output_path}")
    print(f"总共生成了 {total_generated} 帧 (压缩后), 对应原始 {total_generated * 4} 帧")
    print(f"使用模态: {modality_type}")
924
+
925
+
926
def main():
    """CLI entry point: parse arguments and run MoE FramePack sliding-window inference.

    Bug fix: ``--use_camera_cfg`` was previously declared with only
    ``default=True`` (no ``type``/``action``), so any command-line value —
    including ``False`` — arrived as a non-empty string and was always truthy,
    making the flag impossible to disable. It now goes through an explicit
    string-to-bool parser, so ``--use_camera_cfg False`` works while
    ``--use_camera_cfg True`` and the default remain unchanged.
    """

    def _parse_bool(value):
        """Interpret common true/false spellings from the command line."""
        if isinstance(value, bool):
            return value
        return value.strip().lower() in ("1", "true", "t", "yes", "y")

    parser = argparse.ArgumentParser(description="MoE FramePack滑动窗口视频生成 - 支持多模态")

    # Basic arguments (alternate condition files kept for reference):
    # default="/share_zhuyixuan05/zhuyixuan05/sekai-game-walking/00100100001_0004650_0004950/encoded_video.pth"
    # default="/share_zhuyixuan05/zhuyixuan05/nuscenes_video_generation_dynamic/scenes/scene-0001_CAM_FRONT/encoded_video-480p.pth"
    # default="/share_zhuyixuan05/zhuyixuan05/openx-fractal-encoded/episode_000001/encoded_video.pth"
    parser.add_argument("--condition_pth", type=str,
                        default="/share_zhuyixuan05/zhuyixuan05/spatialvid/a9a6d37f-0a6c-548a-a494-7d902469f3f2_0000000_0000300/encoded_video.pth")
    parser.add_argument("--start_frame", type=int, default=0)
    parser.add_argument("--initial_condition_frames", type=int, default=16)
    parser.add_argument("--frames_per_generation", type=int, default=8)
    parser.add_argument("--total_frames_to_generate", type=int, default=8)
    parser.add_argument("--max_history_frames", type=int, default=100)
    parser.add_argument("--use_real_poses", action="store_true", default=False)
    parser.add_argument("--dit_path", type=str,
                        default="/share_zhuyixuan05/zhuyixuan05/ICLR2026/framepack_moe_spatialvid/step250_moe.ckpt")
    parser.add_argument("--output_path", type=str,
                        default='/home/zhuyixuan05/ReCamMaster/moe/infer_results/output_moe_framepack_sliding.mp4')
    parser.add_argument("--prompt", type=str,
                        default="A man enter the room")
    parser.add_argument("--device", type=str, default="cuda")

    # Modality arguments
    parser.add_argument("--modality_type", type=str, choices=["sekai", "nuscenes", "openx"], default="sekai",
                        help="模态类型:sekai 或 nuscenes 或 openx")
    parser.add_argument("--scene_info_path", type=str, default=None,
                        help="NuScenes场景信息文件路径(仅用于nuscenes模态)")

    # CFG arguments
    parser.add_argument("--use_camera_cfg", type=_parse_bool, default=True,
                        help="使用Camera CFG")
    parser.add_argument("--camera_guidance_scale", type=float, default=2.0,
                        help="Camera guidance scale for CFG")
    parser.add_argument("--text_guidance_scale", type=float, default=1.0,
                        help="Text guidance scale for CFG")

    # MoE arguments
    parser.add_argument("--moe_num_experts", type=int, default=1, help="专家数量")
    parser.add_argument("--moe_top_k", type=int, default=1, help="Top-K专家")
    parser.add_argument("--moe_hidden_dim", type=int, default=None, help="MoE隐藏层维度")

    args = parser.parse_args()

    print(f"🔧 MoE FramePack CFG生成设置:")
    print(f"模态类型: {args.modality_type}")
    print(f"Camera CFG: {args.use_camera_cfg}")
    if args.use_camera_cfg:
        print(f"Camera guidance scale: {args.camera_guidance_scale}")
    print(f"Text guidance scale: {args.text_guidance_scale}")
    print(f"MoE配置: experts={args.moe_num_experts}, top_k={args.moe_top_k}")

    # NuScenes needs a scene-info file for real poses; warn when missing.
    if args.modality_type == "nuscenes" and not args.scene_info_path:
        print("⚠️ 使用NuScenes模态但未提供scene_info_path,将使用合成pose数据")

    inference_moe_framepack_sliding_window(
        condition_pth_path=args.condition_pth,
        dit_path=args.dit_path,
        output_path=args.output_path,
        start_frame=args.start_frame,
        initial_condition_frames=args.initial_condition_frames,
        frames_per_generation=args.frames_per_generation,
        total_frames_to_generate=args.total_frames_to_generate,
        max_history_frames=args.max_history_frames,
        device=args.device,
        prompt=args.prompt,
        modality_type=args.modality_type,
        use_real_poses=args.use_real_poses,
        scene_info_path=args.scene_info_path,
        # CFG parameters
        use_camera_cfg=args.use_camera_cfg,
        camera_guidance_scale=args.camera_guidance_scale,
        text_guidance_scale=args.text_guidance_scale,
        # MoE parameters
        moe_num_experts=args.moe_num_experts,
        moe_top_k=args.moe_top_k,
        moe_hidden_dim=args.moe_hidden_dim
    )
1005
+
1006
+
1007
# Script entry point: parse CLI arguments and run MoE FramePack
# sliding-window inference.
if __name__ == "__main__":
    main()
scripts/infer_moe_test.py ADDED
@@ -0,0 +1,976 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import numpy as np
5
+ from PIL import Image
6
+ import imageio
7
+ import json
8
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
9
+ import argparse
10
+ from torchvision.transforms import v2
11
+ from einops import rearrange
12
+ import copy
13
+
14
+
15
+ def load_encoded_video_from_pth(pth_path, start_frame=0, num_frames=10):
16
+ """从pth文件加载预编码的视频数据"""
17
+ print(f"Loading encoded video from {pth_path}")
18
+
19
+ encoded_data = torch.load(pth_path, weights_only=False, map_location="cpu")
20
+ full_latents = encoded_data['latents'] # [C, T, H, W]
21
+
22
+ print(f"Full latents shape: {full_latents.shape}")
23
+ print(f"Extracting frames {start_frame} to {start_frame + num_frames}")
24
+
25
+ if start_frame + num_frames > full_latents.shape[1]:
26
+ raise ValueError(f"Not enough frames: requested {start_frame + num_frames}, available {full_latents.shape[1]}")
27
+
28
+ condition_latents = full_latents[:, start_frame:start_frame + num_frames, :, :]
29
+ print(f"Extracted condition latents shape: {condition_latents.shape}")
30
+
31
+ return condition_latents, encoded_data
32
+
33
+
34
def compute_relative_pose(pose_a, pose_b, use_torch=False):
    """Return camera B's pose expressed relative to camera A, i.e. B @ A^-1.

    Args:
        pose_a: 4x4 extrinsic matrix of the reference camera A.
        pose_b: 4x4 extrinsic matrix of camera B.
        use_torch: When True, compute with torch (converting numpy inputs);
            otherwise compute with numpy (converting non-array inputs).

    Returns:
        4x4 relative pose, as a torch.Tensor or np.ndarray per ``use_torch``.
    """
    assert pose_a.shape == (4, 4), f"相机A外参矩阵形状应为(4,4),实际为{pose_a.shape}"
    assert pose_b.shape == (4, 4), f"相机B外参矩阵形状应为(4,4),实际为{pose_b.shape}"

    if use_torch:
        ref = pose_a if isinstance(pose_a, torch.Tensor) else torch.from_numpy(pose_a).float()
        tgt = pose_b if isinstance(pose_b, torch.Tensor) else torch.from_numpy(pose_b).float()
        return torch.matmul(tgt, torch.inverse(ref))

    ref = pose_a if isinstance(pose_a, np.ndarray) else np.array(pose_a, dtype=np.float32)
    tgt = pose_b if isinstance(pose_b, np.ndarray) else np.array(pose_b, dtype=np.float32)
    return np.matmul(tgt, np.linalg.inv(ref))
57
+
58
+
59
def replace_dit_model_in_manager():
    """Swap the registered 'wan_video_dit' loader class for the MoE variant.

    Mutates diffsynth's global ``model_loader_configs`` in place so that any
    subsequent model loading instantiates ``WanModelMoe`` instead of the stock
    DiT class. Entries that do not reference 'wan_video_dit' are untouched.
    """
    from diffsynth.models.wan_video_dit_moe import WanModelMoe
    from diffsynth.configs.model_config import model_loader_configs

    for idx, entry in enumerate(model_loader_configs):
        keys_hash, keys_hash_with_shape, model_names, model_classes, model_resource = entry

        if 'wan_video_dit' not in model_names:
            continue

        updated_names, updated_classes = [], []
        for name, cls in zip(model_names, model_classes):
            if name == 'wan_video_dit':
                updated_names.append(name)
                updated_classes.append(WanModelMoe)
                print(f"✅ 替换了模型类: {name} -> WanModelMoe")
            else:
                updated_names.append(name)
                updated_classes.append(cls)

        model_loader_configs[idx] = (keys_hash, keys_hash_with_shape, updated_names, updated_classes, model_resource)
81
+
82
+
83
def add_framepack_components(dit_model):
    """Attach FramePack's multi-scale clean-latent embedder to the DiT.

    Idempotent: does nothing if ``dit_model`` already has a
    ``clean_x_embedder``. The embedder patchifies 16-channel clean latents at
    1x, 2x and 4x temporal/spatial compression into the model's hidden dim,
    and is cast to the model's parameter dtype.
    """
    if hasattr(dit_model, 'clean_x_embedder'):
        return

    inner_dim = dit_model.blocks[0].self_attn.q.weight.shape[0]

    class CleanXEmbedder(nn.Module):
        """Project clean latents to tokens at three compression scales."""

        def __init__(self, inner_dim):
            super().__init__()
            self.proj = nn.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
            self.proj_2x = nn.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
            self.proj_4x = nn.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))

        def forward(self, x, scale="1x"):
            projections = {"1x": self.proj, "2x": self.proj_2x, "4x": self.proj_4x}
            if scale not in projections:
                raise ValueError(f"Unsupported scale: {scale}")
            conv = projections[scale]
            # Cast input to the conv's dtype so mixed-precision inputs work.
            return conv(x.to(conv.weight.dtype))

    dit_model.clean_x_embedder = CleanXEmbedder(inner_dim)
    model_dtype = next(dit_model.parameters()).dtype
    dit_model.clean_x_embedder = dit_model.clean_x_embedder.to(dtype=model_dtype)
    print("✅ 添加了FramePack的clean_x_embedder组件")
112
+
113
+
114
def add_moe_components(dit_model, moe_config):
    """🔧 Attach MoE components to every DiT block (corrected version).

    Stores ``moe_config`` on the model (only if not already present), then
    gives each transformer block a Sekai modality processor (13-dim pose+mask
    -> unified_dim) and a ``MultiModalMoE`` mapping unified_dim to the block's
    hidden dim. Components are (re)created on every call.
    """
    if not hasattr(dit_model, 'moe_config'):
        dit_model.moe_config = moe_config
        print("✅ 添加了MoE配置到模型")

    # Import once instead of per block iteration (behavior-identical; Python
    # caches module imports).
    from diffsynth.models.wan_video_dit_moe import ModalityProcessor, MultiModalMoE

    dim = dit_model.blocks[0].self_attn.q.weight.shape[0]
    unified_dim = moe_config.get("unified_dim", 25)

    for i, block in enumerate(dit_model.blocks):
        # Sekai modality processor -> unified_dim.
        block.sekai_processor = ModalityProcessor("sekai", 13, unified_dim)

        # NOTE: a NuScenes processor (8 -> unified_dim) is intentionally
        # left disabled here.

        # MoE network: unified_dim in, transformer hidden dim out.
        block.moe = MultiModalMoE(
            unified_dim=unified_dim,
            output_dim=dim,
            num_experts=moe_config.get("num_experts", 4),
            top_k=moe_config.get("top_k", 2)
        )

        print(f"✅ Block {i} 添加了MoE组件 (unified_dim: {unified_dim}, experts: {moe_config.get('num_experts', 4)})")
142
+
143
+
144
def generate_sekai_camera_embeddings_sliding(cam_data, start_frame, current_history_length, new_frames, total_generated, use_real_poses=True):
    """Build the [T, 13] Sekai camera embedding for the sliding window.

    Each row is a flattened 3x4 frame-to-frame relative pose (12 dims) plus a
    mask channel: 1.0 marks condition (history) frames, 0.0 marks frames to
    generate. Real extrinsics are used when available; otherwise a synthetic
    constant-left-turn trajectory is produced. Returns a bfloat16 tensor.
    """
    time_compression_ratio = 4

    # FramePack index layout: start(1) + 4x(16) + 2x(2) + 1x(1) + new frames.
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames
    # The sequence must cover history + new frames, the FramePack layout, and
    # a floor of 30 frames (identical in both branches, hoisted here).
    max_needed_frames = max(
        start_frame + current_history_length + new_frames,
        framepack_needed_frames,
        30
    )

    has_real_data = use_real_poses and cam_data is not None and 'extrinsic' in cam_data

    relative_poses = []
    if has_real_data:
        print("🔧 使用真实Sekai camera数据")
        cam_extrinsic = cam_data['extrinsic']

        print(f"🔧 计算Sekai camera序列长度:")
        print(f" - 基础需求: {start_frame + current_history_length + new_frames}")
        print(f" - FramePack需求: {framepack_needed_frames}")
        print(f" - 最终生成: {max_needed_frames}")

        for i in range(max_needed_frames):
            # Map the compressed-latent index back to raw video frames
            # (x4 temporal compression).
            frame_idx = i * time_compression_ratio
            next_frame_idx = frame_idx + time_compression_ratio

            if next_frame_idx < len(cam_extrinsic):
                rel = compute_relative_pose(cam_extrinsic[frame_idx], cam_extrinsic[next_frame_idx])
                relative_poses.append(torch.as_tensor(rel[:3, :]))
            else:
                # Past the recorded trajectory: fall back to zero motion.
                print(f"⚠️ 帧{frame_idx}超出camera数据范围,使用零运动")
                relative_poses.append(torch.zeros(3, 4))
    else:
        print("🔧 使用Sekai合成camera数据")
        print(f"🔧 生成Sekai合成camera帧数: {max_needed_frames}")

        for i in range(max_needed_frames):
            # Synthetic trajectory: constant left turn while moving forward.
            yaw_per_frame = 0.05    # per-frame left turn (positive = left)
            forward_speed = 0.005   # per-frame forward step

            pose = np.eye(4, dtype=np.float32)

            # Rotation about the Y axis (turn left).
            cos_yaw = np.cos(yaw_per_frame)
            sin_yaw = np.sin(yaw_per_frame)
            pose[0, 0] = cos_yaw
            pose[0, 2] = sin_yaw
            pose[2, 0] = -sin_yaw
            pose[2, 2] = cos_yaw

            # Advance along local -Z (forward in the rotated frame).
            pose[2, 3] = -forward_speed

            # Slight drift toward the circle center to mimic a circular path.
            radius_drift = 0.002
            pose[0, 3] = -radius_drift

            relative_poses.append(torch.as_tensor(pose[:3, :]))

    # Flatten each 3x4 pose to 12 dims: [T, 3, 4] -> [T, 12].
    pose_embedding = rearrange(torch.stack(relative_poses, dim=0), 'b c d -> b (c d)')

    # Mask channel: condition frames (history) get 1.0, targets stay 0.0.
    mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
    condition_end = min(start_frame + current_history_length, max_needed_frames)
    mask[start_frame:condition_end] = 1.0

    camera_embedding = torch.cat([pose_embedding, mask], dim=1)
    if has_real_data:
        print(f"🔧 Sekai真实camera embedding shape: {camera_embedding.shape}")
    else:
        print(f"🔧 Sekai合成camera embedding shape: {camera_embedding.shape}")
    return camera_embedding.to(torch.bfloat16)
244
+
245
def generate_openx_camera_embeddings_sliding(encoded_data, start_frame, current_history_length, new_frames, use_real_poses):
    """Build the [T, 13] OpenX camera embedding for the sliding window.

    Same layout as the Sekai variant: flattened 3x4 relative pose (12 dims)
    plus a condition-mask channel. Real extrinsics come from
    ``encoded_data['cam_emb']['extrinsic']`` when available; otherwise a
    synthetic small-amplitude robot-manipulation motion is produced.
    Returns a bfloat16 tensor.
    """
    time_compression_ratio = 4

    # FramePack index layout: start(1) + 4x(16) + 2x(2) + 1x(1) + new frames.
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames
    # Must cover history + new frames, the FramePack layout, and a floor of 30
    # (identical in both branches, hoisted here).
    max_needed_frames = max(
        start_frame + current_history_length + new_frames,
        framepack_needed_frames,
        30
    )

    has_real_data = (use_real_poses and encoded_data is not None
                     and 'cam_emb' in encoded_data and 'extrinsic' in encoded_data['cam_emb'])

    relative_poses = []
    if has_real_data:
        print("🔧 使用OpenX真实camera数据")
        cam_extrinsic = encoded_data['cam_emb']['extrinsic']

        print(f"🔧 计算OpenX camera序列长度:")
        print(f" - 基础需求: {start_frame + current_history_length + new_frames}")
        print(f" - FramePack需求: {framepack_needed_frames}")
        print(f" - 最终生成: {max_needed_frames}")

        for i in range(max_needed_frames):
            # OpenX uses the same x4 temporal stride as sekai, but episodes
            # are typically shorter.
            frame_idx = i * time_compression_ratio
            next_frame_idx = frame_idx + time_compression_ratio

            if next_frame_idx < len(cam_extrinsic):
                rel = compute_relative_pose(cam_extrinsic[frame_idx], cam_extrinsic[next_frame_idx])
                relative_poses.append(torch.as_tensor(rel[:3, :]))
            else:
                # Past the recorded trajectory: fall back to zero motion.
                print(f"⚠️ 帧{frame_idx}超出OpenX camera数据范围,使用零运动")
                relative_poses.append(torch.zeros(3, 4))
    else:
        print("🔧 使用OpenX合成camera数据")
        print(f"🔧 生成OpenX合成camera帧数: {max_needed_frames}")

        for i in range(max_needed_frames):
            # Synthetic robot-manipulation motion: small compound rotations
            # plus a slow translation, mimicking fine end-effector movement.
            roll_per_frame = 0.02    # slight roll
            pitch_per_frame = 0.01   # slight pitch
            yaw_per_frame = 0.015    # slight yaw
            forward_speed = 0.003    # slow advance

            pose = np.eye(4, dtype=np.float32)

            # Rotation about X (roll).
            cos_roll = np.cos(roll_per_frame)
            sin_roll = np.sin(roll_per_frame)
            # Rotation about Y (pitch).
            cos_pitch = np.cos(pitch_per_frame)
            sin_pitch = np.sin(pitch_per_frame)
            # Rotation about Z (yaw).
            cos_yaw = np.cos(yaw_per_frame)
            sin_yaw = np.sin(yaw_per_frame)

            # Simplified compound rotation matrix (ZYX order).
            pose[0, 0] = cos_yaw * cos_pitch
            pose[0, 1] = cos_yaw * sin_pitch * sin_roll - sin_yaw * cos_roll
            pose[0, 2] = cos_yaw * sin_pitch * cos_roll + sin_yaw * sin_roll
            pose[1, 0] = sin_yaw * cos_pitch
            pose[1, 1] = sin_yaw * sin_pitch * sin_roll + cos_yaw * cos_roll
            pose[1, 2] = sin_yaw * sin_pitch * cos_roll - cos_yaw * sin_roll
            pose[2, 0] = -sin_pitch
            pose[2, 1] = cos_pitch * sin_roll
            pose[2, 2] = cos_pitch * cos_roll

            # Translation: slight X/Y drift, dominant motion along depth (Z).
            pose[0, 3] = forward_speed * 0.5
            pose[1, 3] = forward_speed * 0.3
            pose[2, 3] = -forward_speed

            relative_poses.append(torch.as_tensor(pose[:3, :]))

    # Flatten each 3x4 pose to 12 dims: [T, 3, 4] -> [T, 12].
    pose_embedding = rearrange(torch.stack(relative_poses, dim=0), 'b c d -> b (c d)')

    # Mask channel: condition frames (history) get 1.0, targets stay 0.0.
    mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
    condition_end = min(start_frame + current_history_length, max_needed_frames)
    mask[start_frame:condition_end] = 1.0

    camera_embedding = torch.cat([pose_embedding, mask], dim=1)
    if has_real_data:
        print(f"🔧 OpenX真实camera embedding shape: {camera_embedding.shape}")
    else:
        print(f"🔧 OpenX合成camera embedding shape: {camera_embedding.shape}")
    return camera_embedding.to(torch.bfloat16)
359
+
360
+
361
+ def generate_nuscenes_camera_embeddings_sliding(scene_info, start_frame, current_history_length, new_frames):
362
+ """为NuScenes数据集生成camera embeddings - 滑动窗口版本 - 修正版,与train_moe.py保持一致"""
363
+ time_compression_ratio = 4
364
+
365
+ # 计算FramePack实际需要的camera帧数
366
+ framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames
367
+
368
+ if scene_info is not None and 'keyframe_poses' in scene_info:
369
+ print("🔧 使用NuScenes真实pose数据")
370
+ keyframe_poses = scene_info['keyframe_poses']
371
+
372
+ if len(keyframe_poses) == 0:
373
+ print("⚠️ NuScenes keyframe_poses为空,使用零pose")
374
+ max_needed_frames = max(framepack_needed_frames, 30)
375
+
376
+ pose_sequence = torch.zeros(max_needed_frames, 7, dtype=torch.float32)
377
+
378
+ mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
379
+ condition_end = min(start_frame + current_history_length, max_needed_frames)
380
+ mask[start_frame:condition_end] = 1.0
381
+
382
+ camera_embedding = torch.cat([pose_sequence, mask], dim=1) # [max_needed_frames, 8]
383
+ print(f"🔧 NuScenes零pose embedding shape: {camera_embedding.shape}")
384
+ return camera_embedding.to(torch.bfloat16)
385
+
386
+ # 使用第一个pose作为参考
387
+ reference_pose = keyframe_poses[0]
388
+
389
+ max_needed_frames = max(framepack_needed_frames, 30)
390
+
391
+ pose_vecs = []
392
+ for i in range(max_needed_frames):
393
+ if i < len(keyframe_poses):
394
+ current_pose = keyframe_poses[i]
395
+
396
+ # 计算相对位移
397
+ translation = torch.tensor(
398
+ np.array(current_pose['translation']) - np.array(reference_pose['translation']),
399
+ dtype=torch.float32
400
+ )
401
+
402
+ # 计算相对旋转(简化版本)
403
+ rotation = torch.tensor(current_pose['rotation'], dtype=torch.float32)
404
+
405
+ pose_vec = torch.cat([translation, rotation], dim=0) # [7D]
406
+ else:
407
+ # 超出范围,使用零pose
408
+ pose_vec = torch.cat([
409
+ torch.zeros(3, dtype=torch.float32),
410
+ torch.tensor([1.0, 0.0, 0.0, 0.0], dtype=torch.float32)
411
+ ], dim=0) # [7D]
412
+
413
+ pose_vecs.append(pose_vec)
414
+
415
+ pose_sequence = torch.stack(pose_vecs, dim=0) # [max_needed_frames, 7]
416
+
417
+ # 创建mask
418
+ mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
419
+ condition_end = min(start_frame + current_history_length, max_needed_frames)
420
+ mask[start_frame:condition_end] = 1.0
421
+
422
+ camera_embedding = torch.cat([pose_sequence, mask], dim=1) # [max_needed_frames, 8]
423
+ print(f"🔧 NuScenes真实pose embedding shape: {camera_embedding.shape}")
424
+ return camera_embedding.to(torch.bfloat16)
425
+
426
+ else:
427
+ print("🔧 使用NuScenes合成pose数据")
428
+ max_needed_frames = max(framepack_needed_frames, 30)
429
+
430
+ # 创建合成运动序列
431
+ pose_vecs = []
432
+ for i in range(max_needed_frames):
433
+ # 简单的前进运动
434
+ translation = torch.tensor([0.0, 0.0, i * 0.1], dtype=torch.float32) # 沿Z轴前进
435
+ rotation = torch.tensor([1.0, 0.0, 0.0, 0.0], dtype=torch.float32) # 无旋转
436
+
437
+ pose_vec = torch.cat([translation, rotation], dim=0) # [7D]
438
+ pose_vecs.append(pose_vec)
439
+
440
+ pose_sequence = torch.stack(pose_vecs, dim=0)
441
+
442
+ # 创建mask
443
+ mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
444
+ condition_end = min(start_frame + current_history_length, max_needed_frames)
445
+ mask[start_frame:condition_end] = 1.0
446
+
447
+ camera_embedding = torch.cat([pose_sequence, mask], dim=1) # [max_needed_frames, 8]
448
+ print(f"🔧 NuScenes合成pose embedding shape: {camera_embedding.shape}")
449
+ return camera_embedding.to(torch.bfloat16)
450
+
451
+ def prepare_framepack_sliding_window_with_camera_moe(history_latents, target_frames_to_generate, camera_embedding_full, start_frame, modality_type, max_history_frames=49):
452
+ """FramePack滑动窗口机制 - MoE版本"""
453
+ # history_latents: [C, T, H, W] 当前的历史latents
454
+ C, T, H, W = history_latents.shape
455
+
456
+ # 固定索引结构(这决定了需要的camera帧数)
457
+ total_indices_length = 1 + 16 + 2 + 1 + target_frames_to_generate
458
+ indices = torch.arange(0, total_indices_length)
459
+ split_sizes = [1, 16, 2, 1, target_frames_to_generate]
460
+ clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = \
461
+ indices.split(split_sizes, dim=0)
462
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=0)
463
+
464
+ # 检查camera长度是否足够
465
+ if camera_embedding_full.shape[0] < total_indices_length:
466
+ shortage = total_indices_length - camera_embedding_full.shape[0]
467
+ padding = torch.zeros(shortage, camera_embedding_full.shape[1],
468
+ dtype=camera_embedding_full.dtype, device=camera_embedding_full.device)
469
+ camera_embedding_full = torch.cat([camera_embedding_full, padding], dim=0)
470
+
471
+ # 从完整camera序列中选取对应部分
472
+ combined_camera = camera_embedding_full[:total_indices_length, :].clone()
473
+
474
+ # 根据当前history length重新设置mask
475
+ combined_camera[:, -1] = 0.0 # 先全部设为target (0)
476
+
477
+ # 设置condition mask:前19帧根据实际历史长度决定
478
+ if T > 0:
479
+ available_frames = min(T, 19)
480
+ start_pos = 19 - available_frames
481
+ combined_camera[start_pos:19, -1] = 1.0 # 将有效的clean latents对应的camera标记为condition
482
+
483
+ print(f"🔧 MoE Camera mask更新:")
484
+ print(f" - 历史帧数: {T}")
485
+ print(f" - 有效condition帧数: {available_frames if T > 0 else 0}")
486
+ print(f" - 模态类型: {modality_type}")
487
+
488
+ # 处理latents
489
+ clean_latents_combined = torch.zeros(C, 19, H, W, dtype=history_latents.dtype, device=history_latents.device)
490
+
491
+ if T > 0:
492
+ available_frames = min(T, 19)
493
+ start_pos = 19 - available_frames
494
+ clean_latents_combined[:, start_pos:, :, :] = history_latents[:, -available_frames:, :, :]
495
+
496
+ clean_latents_4x = clean_latents_combined[:, 0:16, :, :]
497
+ clean_latents_2x = clean_latents_combined[:, 16:18, :, :]
498
+ clean_latents_1x = clean_latents_combined[:, 18:19, :, :]
499
+
500
+ if T > 0:
501
+ start_latent = history_latents[:, 0:1, :, :]
502
+ else:
503
+ start_latent = torch.zeros(C, 1, H, W, dtype=history_latents.dtype, device=history_latents.device)
504
+
505
+ clean_latents = torch.cat([start_latent, clean_latents_1x], dim=1)
506
+
507
+ return {
508
+ 'latent_indices': latent_indices,
509
+ 'clean_latents': clean_latents,
510
+ 'clean_latents_2x': clean_latents_2x,
511
+ 'clean_latents_4x': clean_latents_4x,
512
+ 'clean_latent_indices': clean_latent_indices,
513
+ 'clean_latent_2x_indices': clean_latent_2x_indices,
514
+ 'clean_latent_4x_indices': clean_latent_4x_indices,
515
+ 'camera_embedding': combined_camera,
516
+ 'modality_type': modality_type, # 新增模态类型信息
517
+ 'current_length': T,
518
+ 'next_length': T + target_frames_to_generate
519
+ }
520
+
521
+
522
+ def inference_moe_framepack_sliding_window(
523
+ condition_pth_path,
524
+ dit_path,
525
+ output_path="moe/infer_results/output_moe_framepack_sliding.mp4",
526
+ start_frame=0,
527
+ initial_condition_frames=8,
528
+ frames_per_generation=4,
529
+ total_frames_to_generate=32,
530
+ max_history_frames=49,
531
+ device="cuda",
532
+ prompt="A video of a scene shot using a pedestrian's front camera while walking",
533
+ modality_type="sekai", # "sekai" 或 "nuscenes"
534
+ use_real_poses=True,
535
+ scene_info_path=None, # 对于NuScenes数据集
536
+ # CFG参数
537
+ use_camera_cfg=True,
538
+ camera_guidance_scale=2.0,
539
+ text_guidance_scale=1.0,
540
+ # MoE参数
541
+ moe_num_experts=4,
542
+ moe_top_k=2,
543
+ moe_hidden_dim=None
544
+ ):
545
+ """
546
+ MoE FramePack滑动窗口视频生成 - 支持多模态
547
+ """
548
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
549
+ print(f"🔧 MoE FramePack滑动窗口生成开始...")
550
+ print(f"模态类型: {modality_type}")
551
+ print(f"Camera CFG: {use_camera_cfg}, Camera guidance scale: {camera_guidance_scale}")
552
+ print(f"Text guidance scale: {text_guidance_scale}")
553
+ print(f"MoE配置: experts={moe_num_experts}, top_k={moe_top_k}")
554
+
555
+ # 1. 模型初始化
556
+ replace_dit_model_in_manager()
557
+
558
+ model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
559
+ model_manager.load_models([
560
+ "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
561
+ "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
562
+ "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
563
+ ])
564
+ pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager, device="cuda")
565
+
566
+ # 2. 添加传统camera编码器(兼容性)
567
+ dim = pipe.dit.blocks[0].self_attn.q.weight.shape[0]
568
+ for block in pipe.dit.blocks:
569
+ block.cam_encoder = nn.Linear(13, dim)
570
+ block.projector = nn.Linear(dim, dim)
571
+ block.cam_encoder.weight.data.zero_()
572
+ block.cam_encoder.bias.data.zero_()
573
+ block.projector.weight = nn.Parameter(torch.eye(dim))
574
+ block.projector.bias = nn.Parameter(torch.zeros(dim))
575
+
576
+ # 3. 添加FramePack组件
577
+ add_framepack_components(pipe.dit)
578
+
579
+ # 4. 添加MoE组件
580
+ moe_config = {
581
+ "num_experts": moe_num_experts,
582
+ "top_k": moe_top_k,
583
+ "hidden_dim": moe_hidden_dim or dim * 2,
584
+ "sekai_input_dim": 13, # Sekai: 12维pose + 1维mask
585
+ "nuscenes_input_dim": 8, # NuScenes: 7维pose + 1维mask
586
+ "openx_input_dim": 13 # OpenX: 12维pose + 1维mask (类似sekai)
587
+ }
588
+ add_moe_components(pipe.dit, moe_config)
589
+
590
+ # 5. 加载训练好的权重
591
+ dit_state_dict = torch.load(dit_path, map_location="cpu")
592
+ pipe.dit.load_state_dict(dit_state_dict, strict=False) # 使用strict=False以兼容新增的MoE组件
593
+ pipe = pipe.to(device)
594
+ model_dtype = next(pipe.dit.parameters()).dtype
595
+
596
+ if hasattr(pipe.dit, 'clean_x_embedder'):
597
+ pipe.dit.clean_x_embedder = pipe.dit.clean_x_embedder.to(dtype=model_dtype)
598
+
599
+ pipe.scheduler.set_timesteps(50)
600
+
601
+ # 6. 加载初始条件
602
+ print("Loading initial condition frames...")
603
+ initial_latents, encoded_data = load_encoded_video_from_pth(
604
+ condition_pth_path,
605
+ start_frame=start_frame,
606
+ num_frames=initial_condition_frames
607
+ )
608
+
609
+ # 空间裁剪
610
+ target_height, target_width = 60, 104
611
+ C, T, H, W = initial_latents.shape
612
+
613
+ if H > target_height or W > target_width:
614
+ h_start = (H - target_height) // 2
615
+ w_start = (W - target_width) // 2
616
+ initial_latents = initial_latents[:, :, h_start:h_start+target_height, w_start:w_start+target_width]
617
+ H, W = target_height, target_width
618
+
619
+ history_latents = initial_latents.to(device, dtype=model_dtype)
620
+
621
+ print(f"初始history_latents shape: {history_latents.shape}")
622
+
623
+ # 7. 编码prompt - 支持CFG
624
+ if text_guidance_scale > 1.0:
625
+ prompt_emb_pos = pipe.encode_prompt(prompt)
626
+ prompt_emb_neg = pipe.encode_prompt("")
627
+ print(f"使用Text CFG,guidance scale: {text_guidance_scale}")
628
+ else:
629
+ prompt_emb_pos = pipe.encode_prompt(prompt)
630
+ prompt_emb_neg = None
631
+ print("不使用Text CFG")
632
+
633
+ # 8. 加载场景信息(对于NuScenes)
634
+ scene_info = None
635
+ if modality_type == "nuscenes" and scene_info_path and os.path.exists(scene_info_path):
636
+ with open(scene_info_path, 'r') as f:
637
+ scene_info = json.load(f)
638
+ print(f"加载NuScenes场景信息: {scene_info_path}")
639
+
640
+ # 9. 预生成完整的camera embedding序列
641
+ if modality_type == "sekai":
642
+ camera_embedding_full = generate_sekai_camera_embeddings_sliding(
643
+ encoded_data.get('cam_emb', None),
644
+ 0,
645
+ max_history_frames,
646
+ 0,
647
+ 0,
648
+ use_real_poses=use_real_poses
649
+ ).to(device, dtype=model_dtype)
650
+ elif modality_type == "nuscenes":
651
+ camera_embedding_full = generate_nuscenes_camera_embeddings_sliding(
652
+ scene_info,
653
+ 0,
654
+ max_history_frames,
655
+ 0
656
+ ).to(device, dtype=model_dtype)
657
+ elif modality_type == "openx":
658
+ camera_embedding_full = generate_openx_camera_embeddings_sliding(
659
+ encoded_data,
660
+ 0,
661
+ max_history_frames,
662
+ 0,
663
+ use_real_poses=use_real_poses
664
+ ).to(device, dtype=model_dtype)
665
+ else:
666
+ raise ValueError(f"不支持的模态类型: {modality_type}")
667
+
668
+ print(f"完整camera序列shape: {camera_embedding_full.shape}")
669
+
670
+ # 10. 为Camera CFG创建无条件的camera embedding
671
+ if use_camera_cfg:
672
+ camera_embedding_uncond = torch.zeros_like(camera_embedding_full)
673
+ print(f"创建无条件camera embedding用于CFG")
674
+
675
+ # 11. 滑动窗口生成循环
676
+ total_generated = 0
677
+ all_generated_frames = []
678
+
679
+ while total_generated < total_frames_to_generate:
680
+ current_generation = min(frames_per_generation, total_frames_to_generate - total_generated)
681
+ print(f"\n🔧 生成步骤 {total_generated // frames_per_generation + 1}")
682
+ print(f"当前历史长度: {history_latents.shape[1]}, 本次生成: {current_generation}")
683
+
684
+ # FramePack数据准备 - MoE版本
685
+ framepack_data = prepare_framepack_sliding_window_with_camera_moe(
686
+ history_latents,
687
+ current_generation,
688
+ camera_embedding_full,
689
+ start_frame,
690
+ modality_type,
691
+ max_history_frames
692
+ )
693
+
694
+ # 准备输入
695
+ clean_latents = framepack_data['clean_latents'].unsqueeze(0)
696
+ clean_latents_2x = framepack_data['clean_latents_2x'].unsqueeze(0)
697
+ clean_latents_4x = framepack_data['clean_latents_4x'].unsqueeze(0)
698
+ camera_embedding = framepack_data['camera_embedding'].unsqueeze(0)
699
+
700
+ # 准备modality_inputs
701
+ modality_inputs = {modality_type: camera_embedding}
702
+
703
+ # 为CFG准备无条件camera embedding
704
+ if use_camera_cfg:
705
+ camera_embedding_uncond_batch = camera_embedding_uncond[:camera_embedding.shape[1], :].unsqueeze(0)
706
+ modality_inputs_uncond = {modality_type: camera_embedding_uncond_batch}
707
+
708
+ # 索引处理
709
+ latent_indices = framepack_data['latent_indices'].unsqueeze(0).cpu()
710
+ clean_latent_indices = framepack_data['clean_latent_indices'].unsqueeze(0).cpu()
711
+ clean_latent_2x_indices = framepack_data['clean_latent_2x_indices'].unsqueeze(0).cpu()
712
+ clean_latent_4x_indices = framepack_data['clean_latent_4x_indices'].unsqueeze(0).cpu()
713
+
714
+ # 初始化要生成的latents
715
+ new_latents = torch.randn(
716
+ 1, C, current_generation, H, W,
717
+ device=device, dtype=model_dtype
718
+ )
719
+
720
+ extra_input = pipe.prepare_extra_input(new_latents)
721
+
722
+ print(f"Camera embedding shape: {camera_embedding.shape}")
723
+ print(f"Camera mask分布 - condition: {torch.sum(camera_embedding[0, :, -1] == 1.0).item()}, target: {torch.sum(camera_embedding[0, :, -1] == 0.0).item()}")
724
+
725
+ # 去噪循环 - 支持CFG
726
+ timesteps = pipe.scheduler.timesteps
727
+
728
+ for i, timestep in enumerate(timesteps):
729
+ if i % 10 == 0:
730
+ print(f" 去噪步骤 {i+1}/{len(timesteps)}")
731
+
732
+ timestep_tensor = timestep.unsqueeze(0).to(device, dtype=model_dtype)
733
+
734
+ with torch.no_grad():
735
+ # CFG推理
736
+ if use_camera_cfg and camera_guidance_scale > 1.0:
737
+ # 条件预测(有camera)
738
+ noise_pred_cond, moe_loss = pipe.dit(
739
+ new_latents,
740
+ timestep=timestep_tensor,
741
+ cam_emb=camera_embedding,
742
+ modality_inputs=modality_inputs, # MoE模态输入
743
+ latent_indices=latent_indices,
744
+ clean_latents=clean_latents,
745
+ clean_latent_indices=clean_latent_indices,
746
+ clean_latents_2x=clean_latents_2x,
747
+ clean_latent_2x_indices=clean_latent_2x_indices,
748
+ clean_latents_4x=clean_latents_4x,
749
+ clean_latent_4x_indices=clean_latent_4x_indices,
750
+ **prompt_emb_pos,
751
+ **extra_input
752
+ )
753
+
754
+ # 无条件预测(无camera)
755
+ noise_pred_uncond, moe_loss = pipe.dit(
756
+ new_latents,
757
+ timestep=timestep_tensor,
758
+ cam_emb=camera_embedding_uncond_batch,
759
+ modality_inputs=modality_inputs_uncond, # MoE无条件模态输入
760
+ latent_indices=latent_indices,
761
+ clean_latents=clean_latents,
762
+ clean_latent_indices=clean_latent_indices,
763
+ clean_latents_2x=clean_latents_2x,
764
+ clean_latent_2x_indices=clean_latent_2x_indices,
765
+ clean_latents_4x=clean_latents_4x,
766
+ clean_latent_4x_indices=clean_latent_4x_indices,
767
+ **(prompt_emb_neg if prompt_emb_neg else prompt_emb_pos),
768
+ **extra_input
769
+ )
770
+
771
+ # Camera CFG
772
+ noise_pred = noise_pred_uncond + camera_guidance_scale * (noise_pred_cond - noise_pred_uncond)
773
+
774
+ # 如果同时使用Text CFG
775
+ if text_guidance_scale > 1.0 and prompt_emb_neg:
776
+ noise_pred_text_uncond, moe_loss = pipe.dit(
777
+ new_latents,
778
+ timestep=timestep_tensor,
779
+ cam_emb=camera_embedding,
780
+ modality_inputs=modality_inputs,
781
+ latent_indices=latent_indices,
782
+ clean_latents=clean_latents,
783
+ clean_latent_indices=clean_latent_indices,
784
+ clean_latents_2x=clean_latents_2x,
785
+ clean_latent_2x_indices=clean_latent_2x_indices,
786
+ clean_latents_4x=clean_latents_4x,
787
+ clean_latent_4x_indices=clean_latent_4x_indices,
788
+ **prompt_emb_neg,
789
+ **extra_input
790
+ )
791
+
792
+ # 应用Text CFG到已经应用Camera CFG的结果
793
+ noise_pred = noise_pred_text_uncond + text_guidance_scale * (noise_pred - noise_pred_text_uncond)
794
+
795
+ elif text_guidance_scale > 1.0 and prompt_emb_neg:
796
+ # 只使用Text CFG
797
+ noise_pred_cond, moe_loss = pipe.dit(
798
+ new_latents,
799
+ timestep=timestep_tensor,
800
+ cam_emb=camera_embedding,
801
+ modality_inputs=modality_inputs,
802
+ latent_indices=latent_indices,
803
+ clean_latents=clean_latents,
804
+ clean_latent_indices=clean_latent_indices,
805
+ clean_latents_2x=clean_latents_2x,
806
+ clean_latent_2x_indices=clean_latent_2x_indices,
807
+ clean_latents_4x=clean_latents_4x,
808
+ clean_latent_4x_indices=clean_latent_4x_indices,
809
+ **prompt_emb_pos,
810
+ **extra_input
811
+ )
812
+
813
+ noise_pred_uncond, moe_loss = pipe.dit(
814
+ new_latents,
815
+ timestep=timestep_tensor,
816
+ cam_emb=camera_embedding,
817
+ modality_inputs=modality_inputs,
818
+ latent_indices=latent_indices,
819
+ clean_latents=clean_latents,
820
+ clean_latent_indices=clean_latent_indices,
821
+ clean_latents_2x=clean_latents_2x,
822
+ clean_latent_2x_indices=clean_latent_2x_indices,
823
+ clean_latents_4x=clean_latents_4x,
824
+ clean_latent_4x_indices=clean_latent_4x_indices,
825
+ **prompt_emb_neg,
826
+ **extra_input
827
+ )
828
+
829
+ noise_pred = noise_pred_uncond + text_guidance_scale * (noise_pred_cond - noise_pred_uncond)
830
+
831
+ else:
832
+ # 标准推理(无CFG)
833
+ noise_pred, moe_loss = pipe.dit(
834
+ new_latents,
835
+ timestep=timestep_tensor,
836
+ cam_emb=camera_embedding,
837
+ modality_inputs=modality_inputs, # MoE模态输入
838
+ latent_indices=latent_indices,
839
+ clean_latents=clean_latents,
840
+ clean_latent_indices=clean_latent_indices,
841
+ clean_latents_2x=clean_latents_2x,
842
+ clean_latent_2x_indices=clean_latent_2x_indices,
843
+ clean_latents_4x=clean_latents_4x,
844
+ clean_latent_4x_indices=clean_latent_4x_indices,
845
+ **prompt_emb_pos,
846
+ **extra_input
847
+ )
848
+
849
+ new_latents = pipe.scheduler.step(noise_pred, timestep, new_latents)
850
+
851
+ # 更新历史
852
+ new_latents_squeezed = new_latents.squeeze(0)
853
+ history_latents = torch.cat([history_latents, new_latents_squeezed], dim=1)
854
+
855
+ # 维护滑动窗口
856
+ if history_latents.shape[1] > max_history_frames:
857
+ first_frame = history_latents[:, 0:1, :, :]
858
+ recent_frames = history_latents[:, -(max_history_frames-1):, :, :]
859
+ history_latents = torch.cat([first_frame, recent_frames], dim=1)
860
+ print(f"历史窗口已满,保留第一帧+最新{max_history_frames-1}帧")
861
+
862
+ print(f"更新后history_latents shape: {history_latents.shape}")
863
+
864
+ all_generated_frames.append(new_latents_squeezed)
865
+ total_generated += current_generation
866
+
867
+ print(f"✅ 已生成 {total_generated}/{total_frames_to_generate} 帧")
868
+
869
+ # 12. 解码和保存
870
+ print("\n🔧 解码生成的视频...")
871
+
872
+ all_generated = torch.cat(all_generated_frames, dim=1)
873
+ final_video = torch.cat([initial_latents.to(all_generated.device), all_generated], dim=1).unsqueeze(0)
874
+
875
+ print(f"最终视频shape: {final_video.shape}")
876
+
877
+ decoded_video = pipe.decode_video(final_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16))
878
+
879
+ print(f"Saving video to {output_path}")
880
+
881
+ video_np = decoded_video[0].to(torch.float32).permute(1, 2, 3, 0).cpu().numpy()
882
+ video_np = (video_np * 0.5 + 0.5).clip(0, 1)
883
+ video_np = (video_np * 255).astype(np.uint8)
884
+
885
+ with imageio.get_writer(output_path, fps=20) as writer:
886
+ for frame in video_np:
887
+ writer.append_data(frame)
888
+
889
+ print(f"🔧 MoE FramePack滑动窗口生成完成! 保存到: {output_path}")
890
+ print(f"总共生成了 {total_generated} 帧 (压缩后), 对应原始 {total_generated * 4} 帧")
891
+ print(f"使用模态: {modality_type}")
892
+
893
+
894
+ def main():
895
+ parser = argparse.ArgumentParser(description="MoE FramePack滑动窗口视频生成 - 支持多模态")
896
+
897
+ # 基��参数
898
+ parser.add_argument("--condition_pth", type=str,
899
+ default="/share_zhuyixuan05/zhuyixuan05/sekai-game-walking/00100100001_0004650_0004950/encoded_video.pth")
900
+ #default="/share_zhuyixuan05/zhuyixuan05/nuscenes_video_generation_dynamic/scenes/scene-0001_CAM_FRONT/encoded_video-480p.pth")
901
+ #default="/share_zhuyixuan05/zhuyixuan05/spatialvid/a9a6d37f-0a6c-548a-a494-7d902469f3f2_0000000_0000300/encoded_video.pth")
902
+ #default="/share_zhuyixuan05/zhuyixuan05/openx-fractal-encoded/episode_000001/encoded_video.pth")
903
+ parser.add_argument("--start_frame", type=int, default=0)
904
+ parser.add_argument("--initial_condition_frames", type=int, default=16)
905
+ parser.add_argument("--frames_per_generation", type=int, default=8)
906
+ parser.add_argument("--total_frames_to_generate", type=int, default=40)
907
+ parser.add_argument("--max_history_frames", type=int, default=100)
908
+ parser.add_argument("--use_real_poses", action="store_true", default=False)
909
+ parser.add_argument("--dit_path", type=str,
910
+ default="/share_zhuyixuan05/zhuyixuan05/ICLR2026/framepack_moe_test/step1000_moe.ckpt")
911
+ parser.add_argument("--output_path", type=str,
912
+ default='/home/zhuyixuan05/ReCamMaster/moe/infer_results/output_moe_framepack_sliding.mp4')
913
+ parser.add_argument("--prompt", type=str,
914
+ default="A drone flying scene in a game world")
915
+ parser.add_argument("--device", type=str, default="cuda")
916
+
917
+ # 模态类型参数
918
+ parser.add_argument("--modality_type", type=str, choices=["sekai", "nuscenes", "openx"], default="sekai",
919
+ help="模态类型:sekai 或 nuscenes 或 openx")
920
+ parser.add_argument("--scene_info_path", type=str, default=None,
921
+ help="NuScenes场景信息文件路径(仅用于nuscenes模态)")
922
+
923
+ # CFG参数
924
+ parser.add_argument("--use_camera_cfg", default=True,
925
+ help="使用Camera CFG")
926
+ parser.add_argument("--camera_guidance_scale", type=float, default=2.0,
927
+ help="Camera guidance scale for CFG")
928
+ parser.add_argument("--text_guidance_scale", type=float, default=1.0,
929
+ help="Text guidance scale for CFG")
930
+
931
+ # MoE参数
932
+ parser.add_argument("--moe_num_experts", type=int, default=1, help="专家数量")
933
+ parser.add_argument("--moe_top_k", type=int, default=1, help="Top-K专家")
934
+ parser.add_argument("--moe_hidden_dim", type=int, default=None, help="MoE隐藏层维度")
935
+
936
+ args = parser.parse_args()
937
+
938
+ print(f"🔧 MoE FramePack CFG生成设置:")
939
+ print(f"模态类型: {args.modality_type}")
940
+ print(f"Camera CFG: {args.use_camera_cfg}")
941
+ if args.use_camera_cfg:
942
+ print(f"Camera guidance scale: {args.camera_guidance_scale}")
943
+ print(f"Text guidance scale: {args.text_guidance_scale}")
944
+ print(f"MoE配置: experts={args.moe_num_experts}, top_k={args.moe_top_k}")
945
+
946
+ # 验证NuScenes参数
947
+ if args.modality_type == "nuscenes" and not args.scene_info_path:
948
+ print("⚠️ 使用NuScenes模态但未提供scene_info_path,将使用合成pose数据")
949
+
950
+ inference_moe_framepack_sliding_window(
951
+ condition_pth_path=args.condition_pth,
952
+ dit_path=args.dit_path,
953
+ output_path=args.output_path,
954
+ start_frame=args.start_frame,
955
+ initial_condition_frames=args.initial_condition_frames,
956
+ frames_per_generation=args.frames_per_generation,
957
+ total_frames_to_generate=args.total_frames_to_generate,
958
+ max_history_frames=args.max_history_frames,
959
+ device=args.device,
960
+ prompt=args.prompt,
961
+ modality_type=args.modality_type,
962
+ use_real_poses=args.use_real_poses,
963
+ scene_info_path=args.scene_info_path,
964
+ # CFG参数
965
+ use_camera_cfg=args.use_camera_cfg,
966
+ camera_guidance_scale=args.camera_guidance_scale,
967
+ text_guidance_scale=args.text_guidance_scale,
968
+ # MoE参数
969
+ moe_num_experts=args.moe_num_experts,
970
+ moe_top_k=args.moe_top_k,
971
+ moe_hidden_dim=args.moe_hidden_dim
972
+ )
973
+
974
+
975
+ if __name__ == "__main__":
976
+ main()
scripts/infer_nus.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ from PIL import Image
5
+ import imageio
6
+ import json
7
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
8
+ import argparse
9
+ from torchvision.transforms import v2
10
+ from einops import rearrange
11
+ import torch.nn as nn
12
+ from pose_classifier import PoseClassifier
13
+
14
+
def load_video_frames(video_path, num_frames=20, height=900, width=1600):
    """Load up to *num_frames* frames from a video and preprocess them.

    Each frame is resized to a fixed 480x832, converted to a tensor and
    normalized to [-1, 1].

    NOTE(review): `height`/`width` are currently unused — the resize target
    is hard-coded; confirm whether they should drive the resize.

    Args:
        video_path: path to a video readable by imageio.
        num_frames: maximum number of frames to load.
        height, width: unused (see note above).

    Returns:
        Float tensor of shape [C, T, H, W] in [-1, 1], or None when the
        video yields no frames.
    """
    frame_process = v2.Compose([
        v2.ToTensor(),
        v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])

    def crop_and_resize(image):
        # Fixed target resolution, bilinear resampling.
        return v2.functional.resize(
            image,
            (480, 832),
            interpolation=v2.InterpolationMode.BILINEAR
        )

    reader = imageio.get_reader(video_path)
    frames = []
    try:
        for i, frame_data in enumerate(reader):
            if i >= num_frames:
                break
            frame = Image.fromarray(frame_data)
            frame = crop_and_resize(frame)
            frames.append(frame_process(frame))
    finally:
        # Release the decoder even if decoding fails partway through
        # (the original leaked the reader on exceptions).
        reader.close()

    if not frames:
        return None

    stacked = torch.stack(frames, dim=0)
    return rearrange(stacked, "T C H W -> C T H W")
+
def calculate_relative_rotation(current_rotation, reference_rotation):
    """Compute the relative rotation quaternion q_rel = q_ref^-1 * q_current.

    Both inputs are (w, x, y, z) quaternions; for unit quaternions the
    inverse equals the conjugate, which is what is used here.

    Returns:
        A float32 tensor of shape [4] in (w, x, y, z) order.
    """
    current = torch.tensor(current_rotation, dtype=torch.float32)
    reference = torch.tensor(reference_rotation, dtype=torch.float32)

    # Conjugate of the reference quaternion (its inverse when unit-norm).
    a_w, a_x, a_y, a_z = reference[0], -reference[1], -reference[2], -reference[3]
    b_w, b_x, b_y, b_z = current[0], current[1], current[2], current[3]

    # Hamilton product: q_ref^-1 * q_current.
    return torch.tensor([
        a_w * b_w - a_x * b_x - a_y * b_y - a_z * b_z,
        a_w * b_x + a_x * b_w + a_y * b_z - a_z * b_y,
        a_w * b_y - a_x * b_z + a_y * b_w + a_z * b_x,
        a_w * b_z + a_x * b_y - a_y * b_x + a_z * b_w,
    ])
+
def generate_direction_poses(direction="left", target_frames=10, condition_frames=20):
    """
    Generate pose-class embeddings for a requested motion direction, covering
    both condition and target frames.

    NOTE(review): the default ``direction="left"`` is not one of the accepted
    values and would hit the ValueError branch below — confirm whether the
    default should be "left_turn".

    Fix vs. original: the two debug prints were garbled
    (``f"conditon{...}"`` / ``f"target{...}"``) and are now readable.

    Args:
        direction: 'forward', 'backward', 'left_turn', 'right_turn'
        target_frames: number of target frames
        condition_frames: number of condition frames

    Returns:
        Class embeddings produced by
        create_enhanced_class_embedding_for_inference for the full sequence.

    Raises:
        ValueError: if *direction* is not one of the four supported values.
    """
    classifier = PoseClassifier()

    total_frames = condition_frames + target_frames
    print(f"condition frames: {condition_frames}")
    print(f"target frames: {target_frames}")
    poses = []

    # Condition frames: relatively stable forward motion.
    for i in range(condition_frames):
        t = i / max(1, condition_frames - 1)  # 0 to 1

        translation = [-t * 0.5, 0.0, 0.0]  # slow forward motion
        rotation = [1.0, 0.0, 0.0, 0.0]     # no rotation
        frame_type = 0.0                    # condition

        pose_vec = translation + rotation + [frame_type]  # 8D vector
        poses.append(pose_vec)

    # Target frames: pose depends on the requested direction.
    for i in range(target_frames):
        t = i / max(1, target_frames - 1)  # 0 to 1

        if direction == "forward":
            # Forward: move along negative x, no rotation.
            translation = [-(condition_frames * 0.5 + t * 2.0), 0.0, 0.0]
            rotation = [1.0, 0.0, 0.0, 0.0]  # unit quaternion

        elif direction == "backward":
            # Backward: move along positive x, no rotation.
            translation = [-(condition_frames * 0.5) + t * 2.0, 0.0, 0.0]
            rotation = [1.0, 0.0, 0.0, 0.0]

        elif direction == "left_turn":
            # Left turn: forward motion plus positive yaw about z.
            translation = [-(condition_frames * 0.5 + t * 1.5), t * 0.5, 0.0]
            yaw = t * 0.3  # left-turn angle (radians)
            rotation = [
                np.cos(yaw/2),  # w
                0.0,            # x
                0.0,            # y
                np.sin(yaw/2)   # z (positive = left)
            ]

        elif direction == "right_turn":
            # Right turn: forward motion plus negative yaw about z.
            translation = [-(condition_frames * 0.5 + t * 1.5), -t * 0.5, 0.0]
            yaw = -t * 0.3  # right-turn angle (radians)
            rotation = [
                np.cos(abs(yaw)/2),  # w
                0.0,                 # x
                0.0,                 # y
                np.sin(yaw/2)        # z (negative = right)
            ]
        else:
            raise ValueError(f"Unknown direction: {direction}")

        frame_type = 1.0  # target
        pose_vec = translation + rotation + [frame_type]  # 8D vector
        poses.append(pose_vec)

    pose_sequence = torch.tensor(poses, dtype=torch.float32)

    # Classify only the target part (first 7 dims, frame-type stripped).
    target_pose_sequence = pose_sequence[condition_frames:, :7]

    # Condition frames are all labeled as class 0 (forward).
    condition_classes = torch.full((condition_frames,), 0, dtype=torch.long)
    target_classes = classifier.classify_pose_sequence(target_pose_sequence)
    full_classes = torch.cat([condition_classes, target_classes], dim=0)

    # Build the enhanced embedding from labels plus raw pose features.
    class_embeddings = create_enhanced_class_embedding_for_inference(
        full_classes, pose_sequence, embed_dim=512
    )

    print(f"Generated {direction} poses:")
    print(f" Total frames: {total_frames} (condition: {condition_frames}, target: {target_frames})")
    analysis = classifier.analyze_pose_sequence(target_pose_sequence)
    print(f" Target class distribution: {analysis['class_distribution']}")
    print(f" Target motion segments: {len(analysis['motion_segments'])}")

    return class_embeddings
+
167
def create_enhanced_class_embedding_for_inference(class_labels: torch.Tensor, pose_sequence: torch.Tensor, embed_dim: int = 512) -> torch.Tensor:
    """Create enhanced per-frame class embeddings for inference.

    Combines one-hot direction classes (mapped to signed direction vectors),
    the condition/target frame-type flag, and the raw pose geometry
    (translation + quaternion) into a 13-d feature, then expands it to
    ``embed_dim`` with a fixed random projection whose top-left block is the
    identity, so the first 13 output dims reproduce the features exactly.

    Args:
        class_labels: [num_frames] long tensor with values in {0..3}
            (forward, backward, left_turn, right_turn).
        pose_sequence: [num_frames, 8] tensor — 3 translation dims,
            4 quaternion dims, 1 frame-type flag (0 = condition, 1 = target).
        embed_dim: width of the output embedding.

    Returns:
        [num_frames, embed_dim] float32 tensor.
    """
    num_classes = 4
    num_frames = len(class_labels)

    # Signed unit vectors, one per motion class.
    direction_vectors = torch.tensor([
        [1.0, 0.0, 0.0, 0.0],   # forward
        [-1.0, 0.0, 0.0, 0.0],  # backward
        [0.0, 1.0, 0.0, 0.0],   # left_turn
        [0.0, -1.0, 0.0, 0.0],  # right_turn
    ], dtype=torch.float32)

    # One-hot encode the class labels.
    one_hot = torch.zeros(num_frames, num_classes)
    one_hot.scatter_(1, class_labels.unsqueeze(1), 1)

    # Base embedding: each frame's class mapped to its direction vector.
    base_embeddings = one_hot @ direction_vectors  # [num_frames, 4]

    # Frame-type flags (last pose dim): 0 = condition, 1 = target.
    frame_types = pose_sequence[:, -1]
    frame_type_embeddings = torch.zeros(num_frames, 2)
    frame_type_embeddings[:, 0] = (frame_types == 0).float()  # condition
    frame_type_embeddings[:, 1] = (frame_types == 1).float()  # target

    # Raw pose geometry.
    translations = pose_sequence[:, :3]  # [num_frames, 3]
    rotations = pose_sequence[:, 3:7]    # [num_frames, 4]

    # Combine all features into a single 13-d vector per frame.
    combined_features = torch.cat([
        base_embeddings,        # [num_frames, 4]
        frame_type_embeddings,  # [num_frames, 2]
        translations,           # [num_frames, 3]
        rotations,              # [num_frames, 4]
    ], dim=1)  # [num_frames, 13]

    # Expand to the target dimensionality.
    if embed_dim > 13:
        # Fix: use a seeded generator so the expansion matrix — and therefore
        # the conditioning embedding fed to the model — is identical across
        # calls and across runs. The previous unseeded torch.randn produced a
        # different projection every call, making inference non-reproducible.
        gen = torch.Generator().manual_seed(0)
        expand_matrix = torch.randn(13, embed_dim, generator=gen) * 0.1
        expand_matrix[:13, :13] = torch.eye(13)  # pass raw features through unchanged
        embeddings = combined_features @ expand_matrix
    else:
        embeddings = combined_features[:, :embed_dim]

    return embeddings
214
+
215
def generate_poses_from_file(poses_path, target_frames=10):
    """Build a 512-d class-embedding sequence from a poses.json file.

    Reads ``target_relative_poses`` from the JSON, resamples the entries onto
    ``target_frames`` output frames, classifies the resulting 7-d pose
    vectors, and returns the classifier's class embedding. Falls back to a
    synthetic forward motion when the file contains no poses.
    """
    classifier = PoseClassifier()

    with open(poses_path, 'r') as f:
        poses_data = json.load(f)

    target_relative_poses = poses_data['target_relative_poses']

    # Fall back to synthetic forward motion when the file holds no poses.
    if not target_relative_poses:
        print("No poses found in file, using forward direction")
        return generate_direction_poses("forward", target_frames)

    num_available = len(target_relative_poses)
    pose_vecs = []
    for frame_idx in range(target_frames):
        # Map each output frame onto one of the available pose entries.
        if num_available == 1:
            entry = target_relative_poses[0]
        else:
            src_idx = min(frame_idx * num_available // target_frames,
                          num_available - 1)
            entry = target_relative_poses[src_idx]

        # Relative translation plus the rotation of the current frame
        # expressed relative to the reference frame.
        trans = torch.tensor(entry['relative_translation'], dtype=torch.float32)
        cur_rot = torch.tensor(entry['current_rotation'], dtype=torch.float32)
        ref_rot = torch.tensor(entry['reference_rotation'], dtype=torch.float32)
        rel_rot = calculate_relative_rotation(cur_rot, ref_rot)

        # 7-d pose vector: 3 translation + 4 rotation.
        pose_vecs.append(torch.cat([trans, rel_rot], dim=0))

    pose_sequence = torch.stack(pose_vecs, dim=0)

    # Classify the sequence and turn the labels into class embeddings.
    class_labels = classifier.classify_pose_sequence(pose_sequence)
    class_embeddings = classifier.create_class_embedding(class_labels, embed_dim=512)

    print(f"Generated poses from file:")
    analysis = classifier.analyze_pose_sequence(pose_sequence)
    print(f" Class distribution: {analysis['class_distribution']}")
    print(f" Motion segments: {len(analysis['motion_segments'])}")

    return class_embeddings
264
+
265
def inference_nuscenes_video(
    condition_video_path,
    dit_path,
    text_encoder_path,
    vae_path,
    output_path="nus/infer_results/output_nuscenes.mp4",
    condition_frames=20,
    target_frames=3,
    height=900,
    width=1600,
    device="cuda",
    prompt="A car driving scene captured by front camera",
    poses_path=None,
    direction="forward"
):
    """Direction-class-controlled inference: extend a condition video with
    generated target frames following the requested camera direction.

    NOTE(review): text_encoder_path, vae_path, and poses_path are accepted
    but never read in this body — model files are loaded from hard-coded
    paths below, and poses always come from generate_direction_poses.

    Args:
        condition_video_path: video whose first ``condition_frames`` frames
            condition the generation.
        dit_path: checkpoint with the trained DiT weights (must include the
            cam_encoder/projector params added below; loaded strict=True).
        output_path: where the decoded mp4 is written.
        condition_frames: raw (pre-VAE) condition frame count; divided by 4
            below to get the latent frame count.
        target_frames: number of *latent* frames to generate.
        direction: one of forward / backward / left_turn / right_turn.
    """
    os.makedirs(os.path.dirname(output_path),exist_ok=True)

    print(f"Setting up models for {direction} movement...")

    # 1. Load base Wan2.1 models (DiT + T5 text encoder + VAE) on CPU first.
    model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
    model_manager.load_models([
        "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
        "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
        "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
    ])
    pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager, device="cuda")

    # Add camera components to each DiT block. cam_encoder maps the 512-d
    # class embedding into the block dim; both are initialized to identity /
    # zero so real values come from the checkpoint loaded below.
    dim = pipe.dit.blocks[0].self_attn.q.weight.shape[0]
    for block in pipe.dit.blocks:
        block.cam_encoder = nn.Linear(512, dim)  # keep 512-d embedding input
        block.projector = nn.Linear(dim, dim)
        block.cam_encoder.weight.data.zero_()
        block.cam_encoder.bias.data.zero_()
        block.projector.weight = nn.Parameter(torch.eye(dim))
        block.projector.bias = nn.Parameter(torch.zeros(dim))

    # Load trained DiT weights (overwrites the zero-initialized camera layers).
    dit_state_dict = torch.load(dit_path, map_location="cpu")
    pipe.dit.load_state_dict(dit_state_dict, strict=True)
    pipe = pipe.to(device)
    pipe.scheduler.set_timesteps(50)

    print("Loading condition video...")

    # Load and resize the condition frames.
    condition_video = load_video_frames(
        condition_video_path,
        num_frames=condition_frames,
        height=height,
        width=width
    )

    if condition_video is None:
        raise ValueError(f"Failed to load condition video from {condition_video_path}")

    condition_video = condition_video.unsqueeze(0).to(device, dtype=pipe.torch_dtype)

    print("Processing poses...")

    # Build the pose embedding covering condition AND target frames.
    # condition_frames/4: VAE compresses time by 4x, so pose frames are
    # counted in latent frames.
    print(f"Generating {direction} movement poses...")
    camera_embedding = generate_direction_poses(
        direction=direction,
        target_frames=target_frames,
        condition_frames=int(condition_frames/4)  # latent-frame count after 4x compression
    )

    camera_embedding = camera_embedding.unsqueeze(0).to(device, dtype=torch.bfloat16)

    print(f"Camera embedding shape: {camera_embedding.shape}")
    print(f"Generated poses for direction: {direction}")

    print("Encoding inputs...")

    # Encode text prompt.
    prompt_emb = pipe.encode_prompt(prompt)

    # Encode condition video into latents (tiled to bound memory use).
    condition_latents = pipe.encode_video(condition_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16))[0]

    print("Generating video...")

    # Target latent geometry; spatial size is capped at 60x104 latents.
    batch_size = 1
    channels = condition_latents.shape[0]
    latent_height = condition_latents.shape[2]
    latent_width = condition_latents.shape[3]
    target_height, target_width = 60, 104  # adjust to your setup if needed

    if latent_height > target_height or latent_width > target_width:
        # Center-crop oversized latents.
        h_start = (latent_height - target_height) // 2
        w_start = (latent_width - target_width) // 2
        condition_latents = condition_latents[:, :,
                                              h_start:h_start+target_height,
                                              w_start:w_start+target_width]
        latent_height = target_height
        latent_width = target_width
    condition_latents = condition_latents.to(device, dtype=pipe.torch_dtype)
    condition_latents = condition_latents.unsqueeze(0)
    condition_latents = condition_latents + 0.05 * torch.randn_like(condition_latents) # add a little noise for diversity

    # Initialize target latents with pure noise.
    target_latents = torch.randn(
        batch_size, channels, target_frames, latent_height, latent_width,
        device=device, dtype=pipe.torch_dtype
    )
    print(target_latents.shape)
    print(camera_embedding.shape)
    # Concatenate condition and target latents along the time axis.
    combined_latents = torch.cat([condition_latents, target_latents], dim=2)
    print(combined_latents.shape)

    # Prepare model-specific extra inputs (e.g. positional info).
    extra_input = pipe.prepare_extra_input(combined_latents)

    # Denoising loop: only the target slice is updated each step; the
    # condition slice stays fixed as context.
    timesteps = pipe.scheduler.timesteps

    for i, timestep in enumerate(timesteps):
        print(f"Denoising step {i+1}/{len(timesteps)}")

        # Prepare timestep tensor.
        timestep_tensor = timestep.unsqueeze(0).to(device, dtype=pipe.torch_dtype)

        # Predict noise over the full (condition + target) sequence.
        with torch.no_grad():
            noise_pred = pipe.dit(
                combined_latents,
                timestep=timestep_tensor,
                cam_emb=camera_embedding,
                **prompt_emb,
                **extra_input
            )

        # Update only the target part (latent frames after condition_frames/4).
        # NOTE(review): assumes scheduler.step returns the updated latents —
        # confirm against the scheduler implementation.
        target_noise_pred = noise_pred[:, :, int(condition_frames/4):, :, :]
        target_latents = pipe.scheduler.step(target_noise_pred, timestep, target_latents)

        # Write the updated target slice back into the combined tensor.
        combined_latents[:, :, int(condition_frames/4):, :, :] = target_latents

    print("Decoding video...")

    # Decode the final (condition + generated) latents back to pixels.
    final_video = torch.cat([condition_latents, target_latents], dim=2)
    decoded_video = pipe.decode_video(final_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16))

    # Save video.
    print(f"Saving video to {output_path}")

    # Convert to float32 HWC uint8 frames and denormalize from [-1, 1].
    video_np = decoded_video[0].to(torch.float32).permute(1, 2, 3, 0).cpu().numpy()
    video_np = (video_np * 0.5 + 0.5).clip(0, 1)  # denormalize
    video_np = (video_np * 255).astype(np.uint8)

    with imageio.get_writer(output_path, fps=20) as writer:
        for frame in video_np:
            writer.append_data(frame)

    print(f"Video generation completed! Saved to {output_path}")
431
+
432
def main():
    """CLI entry point for direction-controlled NuScenes video generation."""
    parser = argparse.ArgumentParser(description="NuScenes Video Generation Inference with Direction Control")
    parser.add_argument("--condition_video", type=str, default="/home/zhuyixuan05/ReCamMaster/nus/videos/4032/right.mp4",
                        help="Path to condition video")
    parser.add_argument("--direction", type=str, default="left_turn",
                        choices=["forward", "backward", "left_turn", "right_turn"],
                        help="Direction of camera movement")
    parser.add_argument("--dit_path", type=str, default="/home/zhuyixuan05/ReCamMaster/nus_dynamic/step15000_dynamic.ckpt",
                        help="Path to trained DiT checkpoint")
    parser.add_argument("--text_encoder_path", type=str,
                        default="models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
                        help="Path to text encoder")
    parser.add_argument("--vae_path", type=str,
                        default="models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
                        help="Path to VAE")
    # Fix: default to None so the direction-suffixed auto-naming branch below
    # is reachable. The previous hard-coded default path made
    # `if args.output_path is None` dead code.
    parser.add_argument("--output_path", type=str, default=None,
                        help="Output video path (default: auto-named from the input video and direction)")
    parser.add_argument("--poses_path", type=str, default=None,
                        help="Path to poses.json file (optional, will use direction if not provided)")
    parser.add_argument("--prompt", type=str,
                        default="A car driving scene captured by front camera",
                        help="Text prompt for generation")
    # Raw (pre-VAE) frame count; inference divides this by 4 for latent frames.
    parser.add_argument("--condition_frames", type=int, default=40,
                        help="Number of condition frames")
    # Counted in latent frames (raw frames / 4).
    parser.add_argument("--target_frames", type=int, default=8,
                        help="Number of target frames to generate")
    parser.add_argument("--height", type=int, default=900,
                        help="Video height")
    parser.add_argument("--width", type=int, default=1600,
                        help="Video width")
    parser.add_argument("--device", type=str, default="cuda",
                        help="Device to run inference on")

    args = parser.parse_args()

    condition_video_path = args.condition_video
    input_filename = os.path.basename(condition_video_path)
    output_dir = "nus/infer_results"
    os.makedirs(output_dir, exist_ok=True)

    # Auto-name the output with the requested direction when no explicit
    # path was given, e.g. right.mp4 + left_turn -> right_left_turn.mp4.
    if args.output_path is None:
        name_parts = os.path.splitext(input_filename)
        output_filename = f"{name_parts[0]}_{args.direction}{name_parts[1]}"
        output_path = os.path.join(output_dir, output_filename)
    else:
        output_path = args.output_path

    print(f"Output video will be saved to: {output_path}")
    inference_nuscenes_video(
        condition_video_path=args.condition_video,
        dit_path=args.dit_path,
        text_encoder_path=args.text_encoder_path,
        vae_path=args.vae_path,
        output_path=output_path,
        condition_frames=args.condition_frames,
        target_frames=args.target_frames,
        height=args.height,
        width=args.width,
        device=args.device,
        prompt=args.prompt,
        poses_path=args.poses_path,
        direction=args.direction
    )
498
+
499
+ if __name__ == "__main__":
500
+ main()
scripts/infer_openx.py ADDED
@@ -0,0 +1,614 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
3
+ from torchvision.transforms import v2
4
+ from einops import rearrange
5
+ import os
6
+ import torch
7
+ import torch.nn as nn
8
+ import argparse
9
+ import numpy as np
10
+ import imageio
11
+ import copy
12
+ import random
13
+
14
def load_encoded_video_from_pth(pth_path, start_frame=0, num_frames=10):
    """Load pre-encoded video latents from a .pth file and slice out a clip.

    Args:
        pth_path: path to a torch-saved dict containing a 'latents' tensor
            of shape [C, T, H, W].
        start_frame: first temporal index of the clip.
        num_frames: number of latent frames to extract.

    Returns:
        (condition_latents, encoded_data): the [C, num_frames, H, W] slice
        and the full dict loaded from disk.

    Raises:
        ValueError: if the requested window runs past the available frames.
    """
    print(f"Loading encoded video from {pth_path}")

    # weights_only=False: the checkpoint stores arbitrary python objects;
    # only load files from trusted sources.
    encoded_data = torch.load(pth_path, weights_only=False, map_location="cpu")
    full_latents = encoded_data['latents']  # [C, T, H, W]

    print(f"Full latents shape: {full_latents.shape}")
    print(f"Extracting frames {start_frame} to {start_frame + num_frames}")

    end_frame = start_frame + num_frames
    if end_frame > full_latents.shape[1]:
        raise ValueError(f"Not enough frames: requested {start_frame + num_frames}, available {full_latents.shape[1]}")

    condition_latents = full_latents[:, start_frame:end_frame, :, :]
    print(f"Extracted condition latents shape: {condition_latents.shape}")

    return condition_latents, encoded_data
31
+
32
def compute_relative_pose(pose_a, pose_b, use_torch=False):
    """Return the pose of camera B relative to camera A, i.e. B @ A^-1.

    Args:
        pose_a: 4x4 extrinsic matrix of camera A (numpy array, tensor, or
            array-like).
        pose_b: 4x4 extrinsic matrix of camera B.
        use_torch: if True, compute with torch (converting numpy inputs);
            otherwise compute with numpy.

    Returns:
        4x4 relative pose as a torch.Tensor when use_torch, else np.ndarray.
    """
    assert pose_a.shape == (4, 4), f"相机A外参矩阵形状应为(4,4),实际为{pose_a.shape}"
    assert pose_b.shape == (4, 4), f"相机B外参矩阵形状应为(4,4),实际为{pose_b.shape}"

    if use_torch:
        a = pose_a if isinstance(pose_a, torch.Tensor) else torch.from_numpy(pose_a).float()
        b = pose_b if isinstance(pose_b, torch.Tensor) else torch.from_numpy(pose_b).float()
        return torch.matmul(b, torch.inverse(a))

    a = pose_a if isinstance(pose_a, np.ndarray) else np.array(pose_a, dtype=np.float32)
    b = pose_b if isinstance(pose_b, np.ndarray) else np.array(pose_b, dtype=np.float32)
    return np.matmul(b, np.linalg.inv(a))
55
+
56
def replace_dit_model_in_manager():
    """Swap the registered DiT class for WanModelFuture before models load.

    Mutates diffsynth's global ``model_loader_configs`` in place: any config
    entry whose model-name list contains 'wan_video_dit' gets that entry's
    class replaced by WanModelFuture (names are kept unchanged).
    """
    from diffsynth.models.wan_video_dit_recam_future import WanModelFuture
    from diffsynth.configs.model_config import model_loader_configs

    for idx, entry in enumerate(model_loader_configs):
        keys_hash, keys_hash_with_shape, names, classes, resource = entry

        # Only touch configs that register the wan_video_dit model.
        if 'wan_video_dit' not in names:
            continue

        patched_names = []
        patched_classes = []
        for name, cls in zip(names, classes):
            if name == 'wan_video_dit':
                patched_names.append(name)  # name stays the same
                patched_classes.append(WanModelFuture)  # class is swapped
                print(f"✅ 替换了模型类: {name} -> WanModelFuture")
            else:
                patched_names.append(name)
                patched_classes.append(cls)

        # Write the patched tuple back into the global config list.
        model_loader_configs[idx] = (keys_hash, keys_hash_with_shape, patched_names, patched_classes, resource)
82
+
83
def add_framepack_components(dit_model):
    """Attach FramePack's multi-scale clean-latent embedder to a DiT model.

    Adds a ``clean_x_embedder`` module with three Conv3d projections at
    1x/2x/4x downsampling (mirroring hunyuan_video_packed.py), cast to the
    model's parameter dtype. No-op if the model already has one.
    """
    if hasattr(dit_model, 'clean_x_embedder'):
        return

    inner_dim = dit_model.blocks[0].self_attn.q.weight.shape[0]

    class CleanXEmbedder(nn.Module):
        def __init__(self, inner_dim):
            super().__init__()
            # Mirrors the patchify design in hunyuan_video_packed.py.
            self.proj = nn.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
            self.proj_2x = nn.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
            self.proj_4x = nn.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))

        def forward(self, x, scale="1x"):
            # Dispatch on downsampling scale.
            projections = {"1x": self.proj, "2x": self.proj_2x, "4x": self.proj_4x}
            if scale not in projections:
                raise ValueError(f"Unsupported scale: {scale}")
            return projections[scale](x)

    embedder = CleanXEmbedder(inner_dim)
    # Match the embedder's dtype to the rest of the model.
    model_dtype = next(dit_model.parameters()).dtype
    dit_model.clean_x_embedder = embedder.to(dtype=model_dtype)
    print("✅ 添加了FramePack的clean_x_embedder组件")
110
+
111
def generate_openx_camera_embeddings_sliding(cam_data, start_frame, current_history_length, new_frames, total_generated, use_real_poses=True):
    """Generate camera embeddings for the OpenX dataset (sliding-window version).

    Produces a [max_needed_frames, 13] tensor per frame: a flattened 3x4
    relative pose (12 values) plus a 1-value condition mask, cast to bfloat16.
    Uses real extrinsics from ``cam_data['extrinsic']`` when available,
    otherwise synthesizes gentle robot-manipulation-style motion.

    NOTE(review): ``total_generated`` is accepted but never used in this body.
    """
    # One camera (latent) frame per 4 raw video frames.
    time_compression_ratio = 4

    # Camera frames the FramePack layout requires:
    # 1 start + 16 coarse + 2 mid + 1 fine + new frames.
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames

    if use_real_poses and cam_data is not None and 'extrinsic' in cam_data:
        print("🔧 使用真实OpenX camera数据")
        cam_extrinsic = cam_data['extrinsic']

        # Generate a camera sequence long enough for both the sliding window
        # and the FramePack layout (with a floor of 30 frames).
        max_needed_frames = max(
            start_frame + current_history_length + new_frames,
            framepack_needed_frames,
            30
        )

        print(f"🔧 计算OpenX camera序列长度:")
        print(f" - 基础需求: {start_frame + current_history_length + new_frames}")
        print(f" - FramePack需求: {framepack_needed_frames}")
        print(f" - 最终生成: {max_needed_frames}")

        relative_poses = []
        for i in range(max_needed_frames):
            # OpenX-specific: sample extrinsics every 4 raw frames and use
            # the frame-to-frame relative pose.
            frame_idx = i * time_compression_ratio
            next_frame_idx = frame_idx + time_compression_ratio

            if next_frame_idx < len(cam_extrinsic):
                cam_prev = cam_extrinsic[frame_idx]
                cam_next = cam_extrinsic[next_frame_idx]
                relative_cam = compute_relative_pose(cam_prev, cam_next)
                # Keep only the top 3x4 of the 4x4 relative pose.
                relative_poses.append(torch.as_tensor(relative_cam[:3, :]))
            else:
                # Past the end of the recorded extrinsics: fall back to zero motion.
                print(f"⚠️ 帧{frame_idx}超出camera数据范围,使用零运动")
                relative_poses.append(torch.zeros(3, 4))

        # [T, 3, 4] -> [T, 12]
        pose_embedding = torch.stack(relative_poses, dim=0)
        pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)')

        # Condition mask: 1.0 for frames in [start_frame, start+history).
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        condition_end = min(start_frame + current_history_length, max_needed_frames)
        mask[start_frame:condition_end] = 1.0

        camera_embedding = torch.cat([pose_embedding, mask], dim=1)
        print(f"🔧 OpenX真实camera embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)

    else:
        print("🔧 使用OpenX合成camera数据")

        # Same length computation as the real-pose branch.
        max_needed_frames = max(
            start_frame + current_history_length + new_frames,
            framepack_needed_frames,
            30
        )

        print(f"🔧 生成OpenX合成camera帧数: {max_needed_frames}")
        relative_poses = []
        for i in range(max_needed_frames):
            # OpenX robot-manipulation style: stable, small-amplitude motion
            # emulating fine robot-arm movement.
            forward_speed = 0.001  # per-frame forward step (tiny: fine manipulation)
            lateral_motion = 0.0005 * np.sin(i * 0.05)  # slight left/right drift
            vertical_motion = 0.0003 * np.cos(i * 0.1)  # slight up/down drift

            # Small viewpoint adjustments.
            yaw_change = 0.01 * np.sin(i * 0.03)    # slight yaw
            pitch_change = 0.008 * np.cos(i * 0.04)  # slight pitch

            pose = np.eye(4, dtype=np.float32)

            # Rotation (small angles about the Y and X axes).
            cos_yaw = np.cos(yaw_change)
            sin_yaw = np.sin(yaw_change)
            cos_pitch = np.cos(pitch_change)
            sin_pitch = np.sin(pitch_change)

            # Combined rotation (pitch then yaw).
            pose[0, 0] = cos_yaw
            pose[0, 2] = sin_yaw
            pose[1, 1] = cos_pitch
            pose[1, 2] = -sin_pitch
            pose[2, 0] = -sin_yaw
            pose[2, 1] = sin_pitch
            pose[2, 2] = cos_yaw * cos_pitch

            # Translation (small-amplitude fine-manipulation motion).
            pose[0, 3] = lateral_motion    # X (left/right)
            pose[1, 3] = vertical_motion   # Y (up/down)
            pose[2, 3] = -forward_speed    # Z (negative = forward)

            relative_pose = pose[:3, :]
            relative_poses.append(torch.as_tensor(relative_pose))

        # [T, 3, 4] -> [T, 12]
        pose_embedding = torch.stack(relative_poses, dim=0)
        pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)')

        # Condition mask, same convention as the real-pose branch.
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        condition_end = min(start_frame + current_history_length, max_needed_frames)
        mask[start_frame:condition_end] = 1.0

        camera_embedding = torch.cat([pose_embedding, mask], dim=1)
        print(f"🔧 OpenX合成camera embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)
221
+
222
def prepare_framepack_sliding_window_with_camera(history_latents, target_frames_to_generate, camera_embedding_full, start_frame, max_history_frames=49):
    """Assemble FramePack sliding-window inputs (OpenX variant).

    Builds the fixed FramePack index layout
    [start(1) | 4x(16) | 2x(2) | 1x(1) | new(target_frames_to_generate)],
    right-aligns the most recent history into a 19-frame clean-latent buffer,
    and slices/pads ``camera_embedding_full`` to the window, rewriting its
    last column as the condition(1)/target(0) mask.

    Args:
        history_latents: [C, T, H, W] latents generated so far (T may be 0).
        target_frames_to_generate: new latent frames for this step.
        camera_embedding_full: [T_cam, D] camera sequence; last column is
            the mask channel.
        start_frame: kept for interface compatibility (unused here).
        max_history_frames: kept for interface compatibility (unused here).

    Returns:
        dict with index tensors, clean-latent tensors, the per-window camera
        embedding, and current/next history lengths.
    """
    C, T, H, W = history_latents.shape

    # Fixed index layout; its total length also fixes the camera window size.
    window_len = 1 + 16 + 2 + 1 + target_frames_to_generate
    sections = torch.arange(0, window_len).split([1, 16, 2, 1, target_frames_to_generate], dim=0)
    start_idx, idx_4x, idx_2x, idx_1x, latent_indices = sections
    clean_latent_indices = torch.cat([start_idx, idx_1x], dim=0)

    # Zero-pad the camera sequence if it is shorter than the window.
    cam = camera_embedding_full
    if cam.shape[0] < window_len:
        pad = torch.zeros(window_len - cam.shape[0], cam.shape[1],
                          dtype=cam.dtype, device=cam.device)
        cam = torch.cat([cam, pad], dim=0)

    # Take the window slice and rebuild its mask column.
    combined_camera = cam[:window_len, :].clone()
    combined_camera[:, -1] = 0.0  # default: everything is target

    # Mark the camera rows backing valid clean latents as condition.
    if T > 0:
        available_frames = min(T, 19)
        start_pos = 19 - available_frames
        combined_camera[start_pos:19, -1] = 1.0

    print(f"🔧 OpenX Camera mask更新:")
    print(f" - 历史帧数: {T}")
    print(f" - 有效condition帧数: {available_frames if T > 0 else 0}")

    # Right-align the most recent history into a 19-frame clean buffer.
    clean_buf = torch.zeros(C, 19, H, W, dtype=history_latents.dtype, device=history_latents.device)
    if T > 0:
        available_frames = min(T, 19)
        clean_buf[:, 19 - available_frames:, :, :] = history_latents[:, -available_frames:, :, :]

    clean_latents_4x = clean_buf[:, 0:16, :, :]
    clean_latents_2x = clean_buf[:, 16:18, :, :]
    clean_latents_1x = clean_buf[:, 18:19, :, :]

    # The very first generated frame anchors the sequence; zeros when empty.
    if T > 0:
        start_latent = history_latents[:, 0:1, :, :]
    else:
        start_latent = torch.zeros(C, 1, H, W, dtype=history_latents.dtype, device=history_latents.device)

    return {
        'latent_indices': latent_indices,
        'clean_latents': torch.cat([start_latent, clean_latents_1x], dim=1),
        'clean_latents_2x': clean_latents_2x,
        'clean_latents_4x': clean_latents_4x,
        'clean_latent_indices': clean_latent_indices,
        'clean_latent_2x_indices': idx_2x,
        'clean_latent_4x_indices': idx_4x,
        'camera_embedding': combined_camera,
        'current_length': T,
        'next_length': T + target_frames_to_generate
    }
289
+
290
+ def inference_openx_framepack_sliding_window(
291
+ condition_pth_path,
292
+ dit_path,
293
+ output_path="openx_results/output_openx_framepack_sliding.mp4",
294
+ start_frame=0,
295
+ initial_condition_frames=8,
296
+ frames_per_generation=4,
297
+ total_frames_to_generate=32,
298
+ max_history_frames=49,
299
+ device="cuda",
300
+ prompt="A video of robotic manipulation task with camera movement",
301
+ use_real_poses=True,
302
+ # CFG参数
303
+ use_camera_cfg=True,
304
+ camera_guidance_scale=2.0,
305
+ text_guidance_scale=1.0
306
+ ):
307
+ """
308
+ OpenX FramePack滑动窗口视频生成
309
+ """
310
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
311
+ print(f"🔧 OpenX FramePack滑动窗口生成开始...")
312
+ print(f"Camera CFG: {use_camera_cfg}, Camera guidance scale: {camera_guidance_scale}")
313
+ print(f"Text guidance scale: {text_guidance_scale}")
314
+
315
+ # 1. 模型初始化
316
+ replace_dit_model_in_manager()
317
+
318
+ model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
319
+ model_manager.load_models([
320
+ "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
321
+ "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
322
+ "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
323
+ ])
324
+ pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager, device="cuda")
325
+
326
+ # 2. 添加camera编码器
327
+ dim = pipe.dit.blocks[0].self_attn.q.weight.shape[0]
328
+ for block in pipe.dit.blocks:
329
+ block.cam_encoder = nn.Linear(13, dim)
330
+ block.projector = nn.Linear(dim, dim)
331
+ block.cam_encoder.weight.data.zero_()
332
+ block.cam_encoder.bias.data.zero_()
333
+ block.projector.weight = nn.Parameter(torch.eye(dim))
334
+ block.projector.bias = nn.Parameter(torch.zeros(dim))
335
+
336
+ # 3. 添加FramePack组件
337
+ add_framepack_components(pipe.dit)
338
+
339
+ # 4. 加载训练好的权重
340
+ dit_state_dict = torch.load(dit_path, map_location="cpu")
341
+ pipe.dit.load_state_dict(dit_state_dict, strict=True)
342
+ pipe = pipe.to(device)
343
+ model_dtype = next(pipe.dit.parameters()).dtype
344
+
345
+ if hasattr(pipe.dit, 'clean_x_embedder'):
346
+ pipe.dit.clean_x_embedder = pipe.dit.clean_x_embedder.to(dtype=model_dtype)
347
+
348
+ pipe.scheduler.set_timesteps(50)
349
+
350
+ # 5. 加载初始条件
351
+ print("Loading initial condition frames...")
352
+ initial_latents, encoded_data = load_encoded_video_from_pth(
353
+ condition_pth_path,
354
+ start_frame=start_frame,
355
+ num_frames=initial_condition_frames
356
+ )
357
+
358
+ # 空间裁剪(适配OpenX数据尺寸)
359
+ target_height, target_width = 60, 104
360
+ C, T, H, W = initial_latents.shape
361
+
362
+ if H > target_height or W > target_width:
363
+ h_start = (H - target_height) // 2
364
+ w_start = (W - target_width) // 2
365
+ initial_latents = initial_latents[:, :, h_start:h_start+target_height, w_start:w_start+target_width]
366
+ H, W = target_height, target_width
367
+
368
+ history_latents = initial_latents.to(device, dtype=model_dtype)
369
+
370
+ print(f"初始history_latents shape: {history_latents.shape}")
371
+
372
+ # 6. 编码prompt - 支持CFG
373
+ if text_guidance_scale > 1.0:
374
+ prompt_emb_pos = pipe.encode_prompt(prompt)
375
+ prompt_emb_neg = pipe.encode_prompt("")
376
+ print(f"使用Text CFG,guidance scale: {text_guidance_scale}")
377
+ else:
378
+ prompt_emb_pos = pipe.encode_prompt(prompt)
379
+ prompt_emb_neg = None
380
+ print("不使用Text CFG")
381
+
382
+ # 7. 预生成完整的camera embedding序列
383
+ camera_embedding_full = generate_openx_camera_embeddings_sliding(
384
+ encoded_data.get('cam_emb', None),
385
+ 0,
386
+ max_history_frames,
387
+ 0,
388
+ 0,
389
+ use_real_poses=use_real_poses
390
+ ).to(device, dtype=model_dtype)
391
+
392
+ print(f"完整camera序列shape: {camera_embedding_full.shape}")
393
+
394
+ # 8. 为Camera CFG创建无条件的camera embedding
395
+ if use_camera_cfg:
396
+ camera_embedding_uncond = torch.zeros_like(camera_embedding_full)
397
+ print(f"创建无条件camera embedding用于CFG")
398
+
399
+ # 9. 滑动窗口生成循环
400
+ total_generated = 0
401
+ all_generated_frames = []
402
+
403
+ while total_generated < total_frames_to_generate:
404
+ current_generation = min(frames_per_generation, total_frames_to_generate - total_generated)
405
+ print(f"\n🔧 生成步骤 {total_generated // frames_per_generation + 1}")
406
+ print(f"当前历史长度: {history_latents.shape[1]}, 本次生成: {current_generation}")
407
+
408
+ # FramePack数据准备 - OpenX版本
409
+ framepack_data = prepare_framepack_sliding_window_with_camera(
410
+ history_latents,
411
+ current_generation,
412
+ camera_embedding_full,
413
+ start_frame,
414
+ max_history_frames
415
+ )
416
+
417
+ # 准备输入
418
+ clean_latents = framepack_data['clean_latents'].unsqueeze(0)
419
+ clean_latents_2x = framepack_data['clean_latents_2x'].unsqueeze(0)
420
+ clean_latents_4x = framepack_data['clean_latents_4x'].unsqueeze(0)
421
+ camera_embedding = framepack_data['camera_embedding'].unsqueeze(0)
422
+
423
+ # 为CFG准备无条件camera embedding
424
+ if use_camera_cfg:
425
+ camera_embedding_uncond_batch = camera_embedding_uncond[:camera_embedding.shape[1], :].unsqueeze(0)
426
+
427
+ # 索引处理
428
+ latent_indices = framepack_data['latent_indices'].unsqueeze(0).cpu()
429
+ clean_latent_indices = framepack_data['clean_latent_indices'].unsqueeze(0).cpu()
430
+ clean_latent_2x_indices = framepack_data['clean_latent_2x_indices'].unsqueeze(0).cpu()
431
+ clean_latent_4x_indices = framepack_data['clean_latent_4x_indices'].unsqueeze(0).cpu()
432
+
433
+ # 初始化要生成的latents
434
+ new_latents = torch.randn(
435
+ 1, C, current_generation, H, W,
436
+ device=device, dtype=model_dtype
437
+ )
438
+
439
+ extra_input = pipe.prepare_extra_input(new_latents)
440
+
441
+ print(f"Camera embedding shape: {camera_embedding.shape}")
442
+ print(f"Camera mask分布 - condition: {torch.sum(camera_embedding[0, :, -1] == 1.0).item()}, target: {torch.sum(camera_embedding[0, :, -1] == 0.0).item()}")
443
+
444
+ # 去噪循环 - 支持CFG
445
+ timesteps = pipe.scheduler.timesteps
446
+
447
+ for i, timestep in enumerate(timesteps):
448
+ if i % 10 == 0:
449
+ print(f" 去噪步骤 {i}/{len(timesteps)}")
450
+
451
+ timestep_tensor = timestep.unsqueeze(0).to(device, dtype=model_dtype)
452
+
453
+ with torch.no_grad():
454
+ # 正向预测(带条件)
455
+ noise_pred_pos = pipe.dit(
456
+ new_latents,
457
+ timestep=timestep_tensor,
458
+ cam_emb=camera_embedding,
459
+ latent_indices=latent_indices,
460
+ clean_latents=clean_latents,
461
+ clean_latent_indices=clean_latent_indices,
462
+ clean_latents_2x=clean_latents_2x,
463
+ clean_latent_2x_indices=clean_latent_2x_indices,
464
+ clean_latents_4x=clean_latents_4x,
465
+ clean_latent_4x_indices=clean_latent_4x_indices,
466
+ **prompt_emb_pos,
467
+ **extra_input
468
+ )
469
+
470
+ # CFG处理
471
+ if use_camera_cfg and camera_guidance_scale > 1.0:
472
+ # 无条件预测(无camera条件)
473
+ noise_pred_uncond = pipe.dit(
474
+ new_latents,
475
+ timestep=timestep_tensor,
476
+ cam_emb=camera_embedding_uncond_batch,
477
+ latent_indices=latent_indices,
478
+ clean_latents=clean_latents,
479
+ clean_latent_indices=clean_latent_indices,
480
+ clean_latents_2x=clean_latents_2x,
481
+ clean_latent_2x_indices=clean_latent_2x_indices,
482
+ clean_latents_4x=clean_latents_4x,
483
+ clean_latent_4x_indices=clean_latent_4x_indices,
484
+ **prompt_emb_pos,
485
+ **extra_input
486
+ )
487
+
488
+ # Camera CFG
489
+ noise_pred = noise_pred_uncond + camera_guidance_scale * (noise_pred_pos - noise_pred_uncond)
490
+ else:
491
+ noise_pred = noise_pred_pos
492
+
493
+ # Text CFG
494
+ if prompt_emb_neg is not None and text_guidance_scale > 1.0:
495
+ noise_pred_text_uncond = pipe.dit(
496
+ new_latents,
497
+ timestep=timestep_tensor,
498
+ cam_emb=camera_embedding,
499
+ latent_indices=latent_indices,
500
+ clean_latents=clean_latents,
501
+ clean_latent_indices=clean_latent_indices,
502
+ clean_latents_2x=clean_latents_2x,
503
+ clean_latent_2x_indices=clean_latent_2x_indices,
504
+ clean_latents_4x=clean_latents_4x,
505
+ clean_latent_4x_indices=clean_latent_4x_indices,
506
+ **prompt_emb_neg,
507
+ **extra_input
508
+ )
509
+
510
+ # Text CFG
511
+ noise_pred = noise_pred_text_uncond + text_guidance_scale * (noise_pred - noise_pred_text_uncond)
512
+
513
+ new_latents = pipe.scheduler.step(noise_pred, timestep, new_latents)
514
+
515
+ # 更新历史
516
+ new_latents_squeezed = new_latents.squeeze(0)
517
+ history_latents = torch.cat([history_latents, new_latents_squeezed], dim=1)
518
+
519
+ # 维护滑动窗口
520
+ if history_latents.shape[1] > max_history_frames:
521
+ first_frame = history_latents[:, 0:1, :, :]
522
+ recent_frames = history_latents[:, -(max_history_frames-1):, :, :]
523
+ history_latents = torch.cat([first_frame, recent_frames], dim=1)
524
+ print(f"历史窗口已满,保留第一帧+最新{max_history_frames-1}帧")
525
+
526
+ print(f"更新后history_latents shape: {history_latents.shape}")
527
+
528
+ all_generated_frames.append(new_latents_squeezed)
529
+ total_generated += current_generation
530
+
531
+ print(f"✅ 已生成 {total_generated}/{total_frames_to_generate} 帧")
532
+
533
+ # 10. 解码和保存
534
+ print("\n🔧 解码生成的视频...")
535
+
536
+ all_generated = torch.cat(all_generated_frames, dim=1)
537
+ final_video = torch.cat([initial_latents.to(all_generated.device), all_generated], dim=1).unsqueeze(0)
538
+
539
+ print(f"最终视频shape: {final_video.shape}")
540
+
541
+ decoded_video = pipe.decode_video(final_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16))
542
+
543
+ print(f"Saving video to {output_path}")
544
+
545
+ video_np = decoded_video[0].to(torch.float32).permute(1, 2, 3, 0).cpu().numpy()
546
+ video_np = (video_np * 0.5 + 0.5).clip(0, 1)
547
+ video_np = (video_np * 255).astype(np.uint8)
548
+
549
+ with imageio.get_writer(output_path, fps=20) as writer:
550
+ for frame in video_np:
551
+ writer.append_data(frame)
552
+
553
+ print(f"🔧 OpenX FramePack滑动窗口生成完成! 保存到: {output_path}")
554
+ print(f"总共生成了 {total_generated} 帧 (压缩后), 对应原始 {total_generated * 4} 帧")
555
+
556
def main():
    """CLI entry point: parse arguments and run OpenX FramePack sliding-window inference."""
    parser = argparse.ArgumentParser(description="OpenX FramePack滑动窗口视频生成")

    # Basic I/O and generation-length parameters.
    parser.add_argument("--condition_pth", type=str,
                        default="/share_zhuyixuan05/zhuyixuan05/openx-fractal-encoded/episode_000001/encoded_video.pth",
                        help="输入编码视频路径")
    parser.add_argument("--start_frame", type=int, default=0)
    parser.add_argument("--initial_condition_frames", type=int, default=16)
    parser.add_argument("--frames_per_generation", type=int, default=8)
    parser.add_argument("--total_frames_to_generate", type=int, default=24)
    parser.add_argument("--max_history_frames", type=int, default=100)
    parser.add_argument("--use_real_poses", action="store_true", default=False)
    parser.add_argument("--dit_path", type=str,
                        default="/share_zhuyixuan05/zhuyixuan05/ICLR2026/openx/openx_framepack/step2000.ckpt",
                        help="训练好的模型权重路径")
    parser.add_argument("--output_path", type=str,
                        default='openx_results/output_openx_framepack_sliding.mp4')
    parser.add_argument("--prompt", type=str,
                        default="A video of robotic manipulation task with camera movement")
    parser.add_argument("--device", type=str, default="cuda")

    # Classifier-free-guidance parameters.
    # NOTE(review): action="store_true" combined with default=True means this
    # flag can never be turned OFF from the command line — confirm intended.
    parser.add_argument("--use_camera_cfg", action="store_true", default=True,
                        help="使用Camera CFG")
    parser.add_argument("--camera_guidance_scale", type=float, default=2.0,
                        help="Camera guidance scale for CFG")
    parser.add_argument("--text_guidance_scale", type=float, default=1.0,
                        help="Text guidance scale for CFG")

    args = parser.parse_args()

    # Echo the effective CFG configuration before starting.
    print(f"🔧 OpenX FramePack CFG生成设置:")
    print(f"Camera CFG: {args.use_camera_cfg}")
    if args.use_camera_cfg:
        print(f"Camera guidance scale: {args.camera_guidance_scale}")
    print(f"Text guidance scale: {args.text_guidance_scale}")
    print(f"OpenX特有特性: camera间隔为4帧,适用于机器人操作任务")

    inference_openx_framepack_sliding_window(
        condition_pth_path=args.condition_pth,
        dit_path=args.dit_path,
        output_path=args.output_path,
        start_frame=args.start_frame,
        initial_condition_frames=args.initial_condition_frames,
        frames_per_generation=args.frames_per_generation,
        total_frames_to_generate=args.total_frames_to_generate,
        max_history_frames=args.max_history_frames,
        device=args.device,
        prompt=args.prompt,
        use_real_poses=args.use_real_poses,
        # CFG parameters
        use_camera_cfg=args.use_camera_cfg,
        camera_guidance_scale=args.camera_guidance_scale,
        text_guidance_scale=args.text_guidance_scale
    )
612
+
613
# Run the CLI entry point only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
scripts/infer_origin.py ADDED
@@ -0,0 +1,1108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import numpy as np
5
+ from PIL import Image
6
+ import imageio
7
+ import json
8
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
9
+ import argparse
10
+ from torchvision.transforms import v2
11
+ from einops import rearrange
12
+ import copy
13
+
14
def compute_relative_pose_matrix(pose1, pose2):
    """Compute the relative pose between two consecutive frames.

    Args:
        pose1: Camera pose of frame i, shape (7,): [tx, ty, tz, qx, qy, qz, qw].
        pose2: Camera pose of frame i+1, same layout as ``pose1``.

    Returns:
        A 3x4 numpy matrix ``[R_rel | t_rel]`` where ``R_rel`` is the relative
        rotation and ``t_rel`` the relative translation expressed in frame i's
        coordinate system.
    """
    # Bug fix: the module never imports scipy, so the bare name ``R`` used
    # below raised NameError at call time. Import it locally here.
    from scipy.spatial.transform import Rotation as R

    # Split translation and quaternion components.
    t1 = pose1[:3]  # frame-i translation [tx1, ty1, tz1]
    q1 = pose1[3:]  # frame-i quaternion [qx1, qy1, qz1, qw1]
    t2 = pose2[:3]  # frame-(i+1) translation
    q2 = pose2[3:]  # frame-(i+1) quaternion

    # 1. Relative rotation: R_rel = R2 * R1^{-1}.
    rot1 = R.from_quat(q1)
    rot2 = R.from_quat(q2)
    rot_rel = rot2 * rot1.inv()
    R_rel = rot_rel.as_matrix()

    # 2. Relative translation in frame i's coordinates: t_rel = R1^T (t2 - t1).
    R1_T = rot1.as_matrix().T  # transpose of a rotation matrix equals its inverse
    t_rel = R1_T @ (t2 - t1)

    # 3. Assemble the 3x4 matrix [R_rel | t_rel].
    relative_matrix = np.hstack([R_rel, t_rel.reshape(3, 1)])

    return relative_matrix
45
+
46
def load_encoded_video_from_pth(pth_path, start_frame=0, num_frames=10):
    """Load pre-encoded video latents from a ``.pth`` file and slice a window.

    Returns:
        Tuple ``(condition_latents, encoded_data)`` where ``condition_latents``
        is the ``[C, num_frames, H, W]`` slice starting at ``start_frame`` and
        ``encoded_data`` is the full dict loaded from disk.

    Raises:
        ValueError: when the requested window runs past the encoded sequence.
    """
    print(f"Loading encoded video from {pth_path}")

    encoded_data = torch.load(pth_path, weights_only=False, map_location="cpu")
    # Latents are stored channel-first over time: [C, T, H, W].
    full_latents = encoded_data['latents']

    print(f"Full latents shape: {full_latents.shape}")
    print(f"Extracting frames {start_frame} to {start_frame + num_frames}")

    # Fail fast when there are not enough frames for the requested window.
    if full_latents.shape[1] < start_frame + num_frames:
        raise ValueError(f"Not enough frames: requested {start_frame + num_frames}, available {full_latents.shape[1]}")

    condition_latents = full_latents[:, start_frame:start_frame + num_frames, :, :]
    print(f"Extracted condition latents shape: {condition_latents.shape}")

    return condition_latents, encoded_data
63
+
64
+
65
def compute_relative_pose(pose_a, pose_b, use_torch=False):
    """Return the pose of camera B relative to camera A, i.e. ``pose_b @ pose_a^{-1}``.

    Both inputs must be 4x4 extrinsic matrices. With ``use_torch=True`` the
    math runs on torch tensors (numpy inputs are converted); otherwise numpy.
    """
    assert pose_a.shape == (4, 4), f"相机A外参矩阵形状应为(4,4),实际为{pose_a.shape}"
    assert pose_b.shape == (4, 4), f"相机B外参矩阵形状应为(4,4),实际为{pose_b.shape}"

    if use_torch:
        # Promote numpy inputs to float torch tensors before the solve.
        if not isinstance(pose_a, torch.Tensor):
            pose_a = torch.from_numpy(pose_a).float()
        if not isinstance(pose_b, torch.Tensor):
            pose_b = torch.from_numpy(pose_b).float()
        return torch.matmul(pose_b, torch.inverse(pose_a))

    # numpy path: coerce anything array-like to float32 ndarrays first.
    if not isinstance(pose_a, np.ndarray):
        pose_a = np.array(pose_a, dtype=np.float32)
    if not isinstance(pose_b, np.ndarray):
        pose_b = np.array(pose_b, dtype=np.float32)
    return np.matmul(pose_b, np.linalg.inv(pose_a))
88
+
89
+
90
def replace_dit_model_in_manager():
    """Swap the registered DiT model class for its MoE variant.

    Walks diffsynth's global ``model_loader_configs`` table and, wherever a
    config registers the ``wan_video_dit`` loader, substitutes ``WanModelMoe``
    for the original class while leaving every other entry untouched.
    """
    from diffsynth.models.wan_video_dit_moe import WanModelMoe
    from diffsynth.configs.model_config import model_loader_configs

    for idx, entry in enumerate(model_loader_configs):
        keys_hash, keys_hash_with_shape, names, classes, resource = entry

        # Only configs that mention the vanilla DiT loader need patching.
        if 'wan_video_dit' not in names:
            continue

        patched_names = []
        patched_classes = []
        for name, cls in zip(names, classes):
            patched_names.append(name)
            if name == 'wan_video_dit':
                patched_classes.append(WanModelMoe)
                print(f"✅ 替换了模型类: {name} -> WanModelMoe")
            else:
                patched_classes.append(cls)

        # Rewrite the config tuple in place inside the global table.
        model_loader_configs[idx] = (keys_hash, keys_hash_with_shape, patched_names, patched_classes, resource)
112
+
113
+
114
def add_framepack_components(dit_model):
    """Attach FramePack's multi-scale clean-latent embedder to the DiT model.

    Idempotent: returns immediately when ``clean_x_embedder`` already exists.
    The embedder exposes three Conv3d projections ("1x"/"2x"/"4x") that map
    16-channel latents into the transformer's inner dimension.
    """
    if hasattr(dit_model, 'clean_x_embedder'):
        return

    # Infer the transformer width from the first self-attention q projection.
    inner_dim = dit_model.blocks[0].self_attn.q.weight.shape[0]

    class CleanXEmbedder(nn.Module):
        def __init__(self, inner_dim):
            super().__init__()
            self.proj = nn.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
            self.proj_2x = nn.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
            self.proj_4x = nn.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))

        def forward(self, x, scale="1x"):
            # Pick the projection matching the requested scale; cast the
            # input to that layer's parameter dtype before convolving.
            if scale == "1x":
                layer = self.proj
            elif scale == "2x":
                layer = self.proj_2x
            elif scale == "4x":
                layer = self.proj_4x
            else:
                raise ValueError(f"Unsupported scale: {scale}")
            return layer(x.to(layer.weight.dtype))

    # Match the embedder's dtype to the rest of the model's parameters.
    model_dtype = next(dit_model.parameters()).dtype
    dit_model.clean_x_embedder = CleanXEmbedder(inner_dim).to(dtype=model_dtype)
    print("✅ 添加了FramePack的clean_x_embedder组件")
143
+
144
+
145
def add_moe_components(dit_model, moe_config):
    """Attach MoE routing components to the DiT model in place.

    Installs one modality processor per dataset (sekai / nuscenes / openx),
    a shared global router, and a ``MultiModalMoE`` module on every
    transformer block. The config dict is stored on the model only once.
    """
    if not hasattr(dit_model, 'moe_config'):
        dit_model.moe_config = moe_config
        print("✅ 添加了MoE配置到模型")
    # NOTE(review): the model-level default here is 1 while each block's MoE
    # below defaults to 2 — mirrors the original code; confirm intentional.
    dit_model.top_k = moe_config.get("top_k", 1)

    # Transformer width and MoE hyper-parameters.
    dim = dit_model.blocks[0].self_attn.q.weight.shape[0]
    unified_dim = moe_config.get("unified_dim", 25)
    num_experts = moe_config.get("num_experts", 4)

    from diffsynth.models.wan_video_dit_moe import ModalityProcessor, MultiModalMoE
    dit_model.sekai_processor = ModalityProcessor("sekai", 13, unified_dim)
    dit_model.nuscenes_processor = ModalityProcessor("nuscenes", 8, unified_dim)
    # OpenX shares sekai's 13-d camera input layout but gets its own processor.
    dit_model.openx_processor = ModalityProcessor("openx", 13, unified_dim)
    dit_model.global_router = nn.Linear(unified_dim, num_experts)

    for i, block in enumerate(dit_model.blocks):
        # Per-block MoE: consumes unified_dim features, emits the block dim.
        block.moe = MultiModalMoE(
            unified_dim=unified_dim,
            output_dim=dim,
            num_experts=num_experts,
            top_k=moe_config.get("top_k", 2)
        )

        print(f"✅ Block {i} 添加了MoE组件 (unified_dim: {unified_dim}, experts: {moe_config.get('num_experts', 4)})")
173
+
174
+
175
def generate_sekai_camera_embeddings_sliding(cam_data, start_frame, current_history_length, new_frames, total_generated, use_real_poses=True,direction="left"):
    """Build per-frame camera embeddings for the Sekai dataset (sliding-window version).

    Returns a bfloat16 tensor of shape [max_needed_frames, 13]: a flattened
    3x4 relative-pose matrix (12 values) per frame plus a trailing condition
    mask column (1.0 = condition frame, 0.0 = target frame).

    NOTE(review): ``total_generated`` is accepted but never used in this body.
    NOTE(review): when ``use_real_poses`` is False and ``direction`` is
    neither "left" nor "right", the function implicitly returns None.
    """
    # Latents are temporally compressed 4x relative to raw video frames.
    time_compression_ratio = 4

    # Camera frames FramePack consumes: start(1) + 4x(16) + 2x(2) + 1x(1) + new frames.
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames

    if use_real_poses and cam_data is not None and 'extrinsic' in cam_data:
        print("🔧 使用真实Sekai camera数据")
        cam_extrinsic = cam_data['extrinsic']

        # The sequence must cover both the history window and FramePack's
        # fixed index layout (with a floor of 30 frames).
        max_needed_frames = max(
            start_frame + current_history_length + new_frames,
            framepack_needed_frames,
            30
        )

        print(f"🔧 计算Sekai camera序列长度:")
        print(f" - 基础需求: {start_frame + current_history_length + new_frames}")
        print(f" - FramePack需求: {framepack_needed_frames}")
        print(f" - 最终生成: {max_needed_frames}")

        relative_poses = []
        for i in range(max_needed_frames):
            # Map compressed-frame index i back to raw-video frame indices.
            frame_idx = i * time_compression_ratio
            next_frame_idx = frame_idx + time_compression_ratio

            if next_frame_idx < len(cam_extrinsic):
                cam_prev = cam_extrinsic[frame_idx]
                cam_next = cam_extrinsic[next_frame_idx]
                relative_pose = compute_relative_pose(cam_prev, cam_next)
                relative_poses.append(torch.as_tensor(relative_pose[:3, :]))
            else:
                # Past the end of the recorded trajectory: fall back to zero motion.
                print(f"⚠️ 帧{frame_idx}超出camera数据范围,使用零运动")
                relative_poses.append(torch.zeros(3, 4))

        # [T, 3, 4] -> [T, 12] flattened pose rows.
        pose_embedding = torch.stack(relative_poses, dim=0)
        pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)')

        # Mask column: frames [start_frame, start_frame + history) condition.
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        condition_end = min(start_frame + current_history_length, max_needed_frames)
        mask[start_frame:condition_end] = 1.0

        camera_embedding = torch.cat([pose_embedding, mask], dim=1)
        print(f"🔧 Sekai真实camera embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)

    else:
        if direction=="left":
            print("-----Left-------")

            max_needed_frames = max(
                start_frame + current_history_length + new_frames,
                framepack_needed_frames,
                30
            )

            print(f"🔧 生成Sekai合成camera帧数: {max_needed_frames}")
            relative_poses = []
            for i in range(max_needed_frames):
                # Synthetic continuous left-turn motion.
                yaw_per_frame = 0.05   # yaw per frame (positive = turn left)
                forward_speed = 0.05   # forward distance per frame

                pose = np.eye(4, dtype=np.float32)

                # Rotation about the Y axis (left turn).
                cos_yaw = np.cos(yaw_per_frame)
                sin_yaw = np.sin(yaw_per_frame)

                pose[0, 0] = cos_yaw
                pose[0, 2] = sin_yaw
                pose[2, 0] = -sin_yaw
                pose[2, 2] = cos_yaw

                # Translation: advance along the local (rotated) frame.
                pose[2, 3] = -forward_speed  # negative local Z = forward

                # Slight centripetal drift to approximate a circular path.
                radius_drift = 0.002
                pose[0, 3] = -radius_drift  # negative local X = leftwards

                relative_pose = pose[:3, :]
                relative_poses.append(torch.as_tensor(relative_pose))

            pose_embedding = torch.stack(relative_poses, dim=0)
            pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)')

            # Condition mask matching the generated sequence length.
            mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
            condition_end = min(start_frame + current_history_length, max_needed_frames)
            mask[start_frame:condition_end] = 1.0

            camera_embedding = torch.cat([pose_embedding, mask], dim=1)
            print(f"🔧 Sekai合成camera embedding shape: {camera_embedding.shape}")
            return camera_embedding.to(torch.bfloat16)
        elif direction=="right":
            print("------------Right----------")

            max_needed_frames = max(
                start_frame + current_history_length + new_frames,
                framepack_needed_frames,
                30
            )

            print(f"🔧 生成Sekai合成camera帧数: {max_needed_frames}")
            relative_poses = []
            for i in range(max_needed_frames):
                # NOTE(review): the original comment claimed "left turn", but
                # the yaw here is -0.00 — this branch actually moves straight
                # ahead (faster forward speed, no rotation).
                yaw_per_frame = -0.00
                forward_speed = 0.1  # forward distance per frame

                pose = np.eye(4, dtype=np.float32)

                # Rotation about the Y axis (zero angle -> identity rotation).
                cos_yaw = np.cos(yaw_per_frame)
                sin_yaw = np.sin(yaw_per_frame)

                pose[0, 0] = cos_yaw
                pose[0, 2] = sin_yaw
                pose[2, 0] = -sin_yaw
                pose[2, 2] = cos_yaw

                # Translation: advance along the local (rotated) frame.
                pose[2, 3] = -forward_speed  # negative local Z = forward

                # Lateral drift disabled (0.0) in this branch.
                radius_drift = 0.000
                pose[0, 3] = radius_drift

                relative_pose = pose[:3, :]
                relative_poses.append(torch.as_tensor(relative_pose))

            pose_embedding = torch.stack(relative_poses, dim=0)
            pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)')

            # Condition mask matching the generated sequence length.
            mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
            condition_end = min(start_frame + current_history_length, max_needed_frames)
            mask[start_frame:condition_end] = 1.0

            camera_embedding = torch.cat([pose_embedding, mask], dim=1)
            print(f"🔧 Sekai合成camera embedding shape: {camera_embedding.shape}")
            return camera_embedding.to(torch.bfloat16)
324
+
325
+
326
def generate_openx_camera_embeddings_sliding(encoded_data, start_frame, current_history_length, new_frames, use_real_poses):
    """Build per-frame camera embeddings for the OpenX dataset (sliding-window version).

    Returns a bfloat16 tensor of shape [max_needed_frames, 13]: a flattened
    3x4 relative-pose matrix (12 values) per frame plus a trailing condition
    mask column (1.0 = condition frame, 0.0 = target frame).
    """
    # Latents are temporally compressed 4x relative to raw video frames.
    time_compression_ratio = 4

    # Camera frames FramePack consumes: start(1) + 4x(16) + 2x(2) + 1x(1) + new frames.
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames

    if use_real_poses and encoded_data is not None and 'cam_emb' in encoded_data and 'extrinsic' in encoded_data['cam_emb']:
        print("🔧 使用OpenX真实camera数据")
        cam_extrinsic = encoded_data['cam_emb']['extrinsic']

        # The sequence must cover both the history window and FramePack's
        # fixed index layout (with a floor of 30 frames).
        max_needed_frames = max(
            start_frame + current_history_length + new_frames,
            framepack_needed_frames,
            30
        )

        print(f"🔧 计算OpenX camera序列长度:")
        print(f" - 基础需求: {start_frame + current_history_length + new_frames}")
        print(f" - FramePack需求: {framepack_needed_frames}")
        print(f" - 最终生成: {max_needed_frames}")

        relative_poses = []
        for i in range(max_needed_frames):
            # OpenX also uses the 4x stride, but sequences are shorter.
            frame_idx = i * time_compression_ratio
            next_frame_idx = frame_idx + time_compression_ratio

            if next_frame_idx < len(cam_extrinsic):
                cam_prev = cam_extrinsic[frame_idx]
                cam_next = cam_extrinsic[next_frame_idx]
                relative_pose = compute_relative_pose(cam_prev, cam_next)
                relative_poses.append(torch.as_tensor(relative_pose[:3, :]))
            else:
                # Past the end of the recorded trajectory: fall back to zero motion.
                print(f"⚠️ 帧{frame_idx}超出OpenX camera数据范围,使用零运动")
                relative_poses.append(torch.zeros(3, 4))

        # [T, 3, 4] -> [T, 12] flattened pose rows.
        pose_embedding = torch.stack(relative_poses, dim=0)
        pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)')

        # Mask column: frames [start_frame, start_frame + history) condition.
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        condition_end = min(start_frame + current_history_length, max_needed_frames)
        mask[start_frame:condition_end] = 1.0

        camera_embedding = torch.cat([pose_embedding, mask], dim=1)
        print(f"🔧 OpenX真实camera embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)

    else:
        print("🔧 使用OpenX合成camera数据")

        max_needed_frames = max(
            start_frame + current_history_length + new_frames,
            framepack_needed_frames,
            30
        )

        print(f"🔧 生成OpenX合成camera帧数: {max_needed_frames}")
        relative_poses = []
        for i in range(max_needed_frames):
            # Synthetic robot-manipulation motion: small per-frame rotations
            # and a slow translation, mimicking fine end-effector movement.
            roll_per_frame = 0.02   # slight roll
            pitch_per_frame = 0.01  # slight pitch
            yaw_per_frame = 0.015   # slight yaw
            forward_speed = 0.003   # slow forward speed

            pose = np.eye(4, dtype=np.float32)

            # Per-axis sines/cosines for the composite rotation.
            # about X (roll)
            cos_roll = np.cos(roll_per_frame)
            sin_roll = np.sin(roll_per_frame)
            # about Y (pitch)
            cos_pitch = np.cos(pitch_per_frame)
            sin_pitch = np.sin(pitch_per_frame)
            # about Z (yaw)
            cos_yaw = np.cos(yaw_per_frame)
            sin_yaw = np.sin(yaw_per_frame)

            # Composite rotation matrix in ZYX order.
            pose[0, 0] = cos_yaw * cos_pitch
            pose[0, 1] = cos_yaw * sin_pitch * sin_roll - sin_yaw * cos_roll
            pose[0, 2] = cos_yaw * sin_pitch * cos_roll + sin_yaw * sin_roll
            pose[1, 0] = sin_yaw * cos_pitch
            pose[1, 1] = sin_yaw * sin_pitch * sin_roll + cos_yaw * cos_roll
            pose[1, 2] = sin_yaw * sin_pitch * cos_roll - cos_yaw * sin_roll
            pose[2, 0] = -sin_pitch
            pose[2, 1] = cos_pitch * sin_roll
            pose[2, 2] = cos_pitch * cos_roll

            # Translation: fine manipulator-style displacement, dominated by Z.
            pose[0, 3] = forward_speed * 0.5  # slight X motion
            pose[1, 3] = forward_speed * 0.3  # slight Y motion
            pose[2, 3] = -forward_speed       # main motion along depth (Z)

            relative_pose = pose[:3, :]
            relative_poses.append(torch.as_tensor(relative_pose))

        pose_embedding = torch.stack(relative_poses, dim=0)
        pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)')

        # Condition mask matching the generated sequence length.
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        condition_end = min(start_frame + current_history_length, max_needed_frames)
        mask[start_frame:condition_end] = 1.0

        camera_embedding = torch.cat([pose_embedding, mask], dim=1)
        print(f"🔧 OpenX合成camera embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)
440
+
441
+
442
def generate_nuscenes_camera_embeddings_sliding(scene_info, start_frame, current_history_length, new_frames):
    """Build per-frame camera embeddings for NuScenes (sliding-window version).

    Unlike the sekai/openx generators (12-d pose + mask), NuScenes uses a
    7-d pose vector (translation xyz + quaternion) plus the trailing
    condition mask, giving a bfloat16 [max_needed_frames, 8] tensor.
    Kept consistent with train_moe.py per the original note.
    """
    # Latents are temporally compressed 4x relative to raw video frames.
    # NOTE(review): unlike the other generators, this ratio is never used below.
    time_compression_ratio = 4

    # Camera frames FramePack consumes: start(1) + 4x(16) + 2x(2) + 1x(1) + new frames.
    framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames

    if scene_info is not None and 'keyframe_poses' in scene_info:
        print("🔧 使用NuScenes真实pose数据")
        keyframe_poses = scene_info['keyframe_poses']

        if len(keyframe_poses) == 0:
            # No recorded poses at all: emit an all-zero pose sequence.
            print("⚠️ NuScenes keyframe_poses为空,使用零pose")
            max_needed_frames = max(framepack_needed_frames, 30)

            pose_sequence = torch.zeros(max_needed_frames, 7, dtype=torch.float32)

            mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
            condition_end = min(start_frame + current_history_length, max_needed_frames)
            mask[start_frame:condition_end] = 1.0

            camera_embedding = torch.cat([pose_sequence, mask], dim=1)  # [max_needed_frames, 8]
            print(f"🔧 NuScenes零pose embedding shape: {camera_embedding.shape}")
            return camera_embedding.to(torch.bfloat16)

        # All poses are expressed relative to the first keyframe.
        reference_pose = keyframe_poses[0]

        max_needed_frames = max(framepack_needed_frames, 30)

        pose_vecs = []
        for i in range(max_needed_frames):
            if i < len(keyframe_poses):
                current_pose = keyframe_poses[i]

                # Translation relative to the reference keyframe.
                translation = torch.tensor(
                    np.array(current_pose['translation']) - np.array(reference_pose['translation']),
                    dtype=torch.float32
                )

                # Rotation is taken as-is (simplified: no relative rotation).
                rotation = torch.tensor(current_pose['rotation'], dtype=torch.float32)

                pose_vec = torch.cat([translation, rotation], dim=0)  # [7D]
            else:
                # Past the recorded trajectory: zero translation, identity quaternion.
                pose_vec = torch.cat([
                    torch.zeros(3, dtype=torch.float32),
                    torch.tensor([1.0, 0.0, 0.0, 0.0], dtype=torch.float32)
                ], dim=0)  # [7D]

            pose_vecs.append(pose_vec)

        pose_sequence = torch.stack(pose_vecs, dim=0)  # [max_needed_frames, 7]

        # Condition mask over [start_frame, start_frame + history).
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        condition_end = min(start_frame + current_history_length, max_needed_frames)
        mask[start_frame:condition_end] = 1.0

        camera_embedding = torch.cat([pose_sequence, mask], dim=1)  # [max_needed_frames, 8]
        print(f"🔧 NuScenes真实pose embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)

    else:
        print("🔧 使用NuScenes合成pose数据")
        max_needed_frames = max(framepack_needed_frames, 30)

        # Synthetic left-turn trajectory resembling an urban driving turn.
        pose_vecs = []
        for i in range(max_needed_frames):
            # Heading change per frame: 0.04 rad (original comment said 0.08,
            # which did not match the code) on a car-like turning radius.
            angle = i * 0.04
            radius = 15.0

            # Position on the circular arc; motion stays in the ground plane.
            x = radius * np.sin(angle)
            y = 0.0
            z = radius * (1 - np.cos(angle))

            translation = torch.tensor([x, y, z], dtype=torch.float32)

            # Vehicle heading follows the arc tangent (yaw about the Y axis).
            yaw = angle + np.pi/2
            # Quaternion layout below is [w, x, y, z] per the original code.
            rotation = torch.tensor([
                np.cos(yaw/2),  # w (real part)
                0.0,            # x
                0.0,            # y
                np.sin(yaw/2)   # z (imaginary part, about Y per original comment — verify axis)
            ], dtype=torch.float32)

            pose_vec = torch.cat([translation, rotation], dim=0)  # [7D: tx,ty,tz,qw,qx,qy,qz]
            pose_vecs.append(pose_vec)

        pose_sequence = torch.stack(pose_vecs, dim=0)

        # Condition mask over [start_frame, start_frame + history).
        mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32)
        condition_end = min(start_frame + current_history_length, max_needed_frames)
        mask[start_frame:condition_end] = 1.0

        camera_embedding = torch.cat([pose_sequence, mask], dim=1)  # [max_needed_frames, 8]
        print(f"🔧 NuScenes合成左转pose embedding shape: {camera_embedding.shape}")
        return camera_embedding.to(torch.bfloat16)
548
+
549
def prepare_framepack_sliding_window_with_camera_moe(history_latents, target_frames_to_generate, camera_embedding_full, start_frame, modality_type, max_history_frames=49):
    """Assemble FramePack conditioning inputs (MoE variant) for one generation step.

    Args:
        history_latents: [C, T, H, W] latents generated/conditioned so far.
        target_frames_to_generate: number of new latent frames to denoise.
        camera_embedding_full: [F, D] per-frame camera embedding; the last
            column is a condition mask and is rewritten here.
        start_frame: kept for interface compatibility (unused in this body).
        modality_type: dataset tag (e.g. "sekai"/"nuscenes"/"openx"), passed through.
        max_history_frames: kept for interface compatibility (window upkeep
            happens in the caller).

    Returns:
        Dict with clean latents at 1x/2x/4x scales, their index tensors, the
        sliced camera embedding with a refreshed mask, the modality tag, and
        bookkeeping lengths.
    """
    C, T, H, W = history_latents.shape

    # Fixed FramePack index layout: [start | 16 @ 4x | 2 @ 2x | 1 @ 1x | targets].
    section_sizes = [1, 16, 2, 1, target_frames_to_generate]
    total_indices_length = sum(section_sizes)
    start_idx, idx_4x, idx_2x, idx_1x, latent_indices = \
        torch.arange(0, total_indices_length).split(section_sizes, dim=0)
    clean_latent_indices = torch.cat([start_idx, idx_1x], dim=0)

    # Zero-pad the camera sequence when it is shorter than the index layout.
    if camera_embedding_full.shape[0] < total_indices_length:
        shortage = total_indices_length - camera_embedding_full.shape[0]
        padding = torch.zeros(shortage, camera_embedding_full.shape[1],
                              dtype=camera_embedding_full.dtype, device=camera_embedding_full.device)
        camera_embedding_full = torch.cat([camera_embedding_full, padding], dim=0)

    # Slice out exactly the rows the index layout covers.
    combined_camera = camera_embedding_full[:total_indices_length, :].clone()

    # Refresh the mask column: default everything to target (0.0), then flag
    # the camera rows backing the up-to-19 most recent history frames.
    combined_camera[:, -1] = 0.0
    if T > 0:
        available_frames = min(T, 19)
        start_pos = 19 - available_frames
        combined_camera[start_pos:19, -1] = 1.0

    print(f"🔧 MoE Camera mask更新:")
    print(f" - 历史帧数: {T}")
    print(f" - 有效condition帧数: {available_frames if T > 0 else 0}")
    print(f" - 模态类型: {modality_type}")

    # Right-align the most recent history frames into a fixed 19-slot buffer.
    clean_latents_combined = torch.zeros(C, 19, H, W, dtype=history_latents.dtype, device=history_latents.device)
    if T > 0:
        available_frames = min(T, 19)
        start_pos = 19 - available_frames
        clean_latents_combined[:, start_pos:, :, :] = history_latents[:, -available_frames:, :, :]

    # Split the buffer into FramePack's three temporal scales.
    clean_latents_4x = clean_latents_combined[:, 0:16, :, :]
    clean_latents_2x = clean_latents_combined[:, 16:18, :, :]
    clean_latents_1x = clean_latents_combined[:, 18:19, :, :]

    # The very first history frame anchors the sequence; fall back to zeros
    # when there is no history yet.
    if T > 0:
        start_latent = history_latents[:, 0:1, :, :]
    else:
        start_latent = torch.zeros(C, 1, H, W, dtype=history_latents.dtype, device=history_latents.device)

    clean_latents = torch.cat([start_latent, clean_latents_1x], dim=1)

    return {
        'latent_indices': latent_indices,
        'clean_latents': clean_latents,
        'clean_latents_2x': clean_latents_2x,
        'clean_latents_4x': clean_latents_4x,
        'clean_latent_indices': clean_latent_indices,
        'clean_latent_2x_indices': idx_2x,
        'clean_latent_4x_indices': idx_4x,
        'camera_embedding': combined_camera,
        'modality_type': modality_type,
        'current_length': T,
        'next_length': T + target_frames_to_generate
    }
618
+
619
+
620
def inference_moe_framepack_sliding_window(
    condition_pth_path,
    dit_path,
    output_path="moe/infer_results/output_moe_framepack_sliding.mp4",
    start_frame=0,
    initial_condition_frames=8,
    frames_per_generation=4,
    total_frames_to_generate=32,
    max_history_frames=49,
    device="cuda",
    prompt="A video of a scene shot using a pedestrian's front camera while walking",
    modality_type="sekai",  # "sekai" or "nuscenes" (an "openx" branch also exists below)
    use_real_poses=True,
    scene_info_path=None,  # only used for the NuScenes dataset
    # CFG parameters
    use_camera_cfg=True,
    camera_guidance_scale=2.0,
    text_guidance_scale=1.0,
    # MoE parameters
    moe_num_experts=4,
    moe_top_k=2,
    moe_hidden_dim=None,
    direction="left",
    use_gt_prompt=True
):
    """MoE FramePack sliding-window video generation with multi-modality support.

    Autoregressively extends a pre-encoded condition video: each iteration
    generates ``frames_per_generation`` latent frames conditioned on a sliding
    history window (up to ``max_history_frames`` latent frames, always keeping
    the very first frame), then decodes the full sequence with the VAE and
    writes an mp4 to ``output_path``.

    All frame counts here are in *latent* time; the final log line multiplies
    by 4, so one latent frame corresponds to 4 raw video frames.

    Supports optional classifier-free guidance on the camera conditioning
    (``use_camera_cfg`` / ``camera_guidance_scale``), on the text prompt
    (``text_guidance_scale`` > 1.0), or both combined sequentially.

    NOTE(review): assumes helpers defined elsewhere in this file/module —
    ``replace_dit_model_in_manager``, ``add_framepack_components``,
    ``add_moe_components``, ``load_encoded_video_from_pth``,
    ``prepare_framepack_sliding_window_with_camera_moe`` and the per-modality
    ``generate_*_camera_embeddings_sliding`` functions.
    """
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    print(f"🔧 MoE FramePack滑动窗口生成开始...")
    print(f"模态类型: {modality_type}")
    print(f"Camera CFG: {use_camera_cfg}, Camera guidance scale: {camera_guidance_scale}")
    print(f"Text guidance scale: {text_guidance_scale}")
    print(f"MoE配置: experts={moe_num_experts}, top_k={moe_top_k}")

    # 1. Model initialisation: swap in the custom DiT class, then load the
    # Wan2.1 base weights (DiT + T5 text encoder + VAE) on CPU first.
    replace_dit_model_in_manager()

    model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
    model_manager.load_models([
        "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
        "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
        "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
    ])
    pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager, device="cuda")

    # 2. Attach the legacy per-block camera encoder (kept for checkpoint
    # compatibility). Zero-init the encoder and identity-init the projector so
    # the modules are no-ops until trained weights are loaded.
    dim = pipe.dit.blocks[0].self_attn.q.weight.shape[0]
    for block in pipe.dit.blocks:
        block.cam_encoder = nn.Linear(13, dim)
        block.projector = nn.Linear(dim, dim)
        block.cam_encoder.weight.data.zero_()
        block.cam_encoder.bias.data.zero_()
        block.projector.weight = nn.Parameter(torch.eye(dim))
        block.projector.bias = nn.Parameter(torch.zeros(dim))

    # 3. Attach FramePack components (clean-latent embedders/indices).
    add_framepack_components(pipe.dit)

    # 4. Attach MoE components; input dims are pose-dim + 1 mask channel.
    moe_config = {
        "num_experts": moe_num_experts,
        "top_k": moe_top_k,
        "hidden_dim": moe_hidden_dim or dim * 2,
        "sekai_input_dim": 13,   # Sekai: 12-dim pose + 1-dim mask
        "nuscenes_input_dim": 8, # NuScenes: 7-dim pose + 1-dim mask
        "openx_input_dim": 13    # OpenX: 12-dim pose + 1-dim mask (like sekai)
    }
    add_moe_components(pipe.dit, moe_config)

    # 5. Load the fine-tuned checkpoint. strict=False tolerates the newly
    # added MoE parameters that may be absent from older checkpoints.
    dit_state_dict = torch.load(dit_path, map_location="cpu")
    pipe.dit.load_state_dict(dit_state_dict, strict=False)
    pipe = pipe.to(device)
    model_dtype = next(pipe.dit.parameters()).dtype

    if hasattr(pipe.dit, 'clean_x_embedder'):
        pipe.dit.clean_x_embedder = pipe.dit.clean_x_embedder.to(dtype=model_dtype)

    pipe.scheduler.set_timesteps(50)

    # 6. Load the initial condition latents from the pre-encoded .pth file.
    print("Loading initial condition frames...")
    initial_latents, encoded_data = load_encoded_video_from_pth(
        condition_pth_path,
        start_frame=start_frame,
        num_frames=initial_condition_frames
    )

    # Spatial center-crop to the model's expected latent resolution.
    target_height, target_width = 60, 104
    C, T, H, W = initial_latents.shape

    if H > target_height or W > target_width:
        h_start = (H - target_height) // 2
        w_start = (W - target_width) // 2
        initial_latents = initial_latents[:, :, h_start:h_start+target_height, w_start:w_start+target_width]
        H, W = target_height, target_width

    history_latents = initial_latents.to(device, dtype=model_dtype)

    print(f"初始history_latents shape: {history_latents.shape}")

    # 7. Prompt encoding, with optional CFG negative prompt.
    if use_gt_prompt and 'prompt_emb' in encoded_data:
        # Reuse the pre-encoded ground-truth prompt embedding stored alongside
        # the latents instead of re-running the text encoder.
        print("✅ 使用预编码的GT prompt embedding")
        prompt_emb_pos = encoded_data['prompt_emb']
        # Move the embedding tensors to the model's device/dtype.
        if 'context' in prompt_emb_pos:
            prompt_emb_pos['context'] = prompt_emb_pos['context'].to(device, dtype=model_dtype)
        if 'context_mask' in prompt_emb_pos:
            prompt_emb_pos['context_mask'] = prompt_emb_pos['context_mask'].to(device, dtype=model_dtype)

        # For text CFG, encode an empty prompt as the unconditional branch.
        if text_guidance_scale > 1.0:
            prompt_emb_neg = pipe.encode_prompt("")
            print(f"使用Text CFG with GT prompt,guidance scale: {text_guidance_scale}")
        else:
            prompt_emb_neg = None
            print("不使用Text CFG")

        # Log the raw GT prompt text when it was stored with the embedding.
        if 'prompt' in encoded_data['prompt_emb']:
            gt_prompt_text = encoded_data['prompt_emb']['prompt']
            print(f"📝 GT Prompt文本: {gt_prompt_text}")
    else:
        # Re-encode the prompt passed as an argument.
        print(f"🔄 重新编码prompt: {prompt}")
        if text_guidance_scale > 1.0:
            prompt_emb_pos = pipe.encode_prompt(prompt)
            prompt_emb_neg = pipe.encode_prompt("")
            print(f"使用Text CFG,guidance scale: {text_guidance_scale}")
        else:
            prompt_emb_pos = pipe.encode_prompt(prompt)
            prompt_emb_neg = None
            print("不使用Text CFG")

    # 8. Scene metadata (NuScenes pose source) is optional.
    scene_info = None
    if modality_type == "nuscenes" and scene_info_path and os.path.exists(scene_info_path):
        with open(scene_info_path, 'r') as f:
            scene_info = json.load(f)
        print(f"加载NuScenes场景信息: {scene_info_path}")

    # 9. Pre-generate the full camera-embedding sequence for the whole run.
    if modality_type == "sekai":
        camera_embedding_full = generate_sekai_camera_embeddings_sliding(
            encoded_data.get('cam_emb', None),
            0,
            max_history_frames,
            0,
            0,
            use_real_poses=use_real_poses,
            direction=direction
        ).to(device, dtype=model_dtype)
    elif modality_type == "nuscenes":
        camera_embedding_full = generate_nuscenes_camera_embeddings_sliding(
            scene_info,
            0,
            max_history_frames,
            0
        ).to(device, dtype=model_dtype)
    elif modality_type == "openx":
        camera_embedding_full = generate_openx_camera_embeddings_sliding(
            encoded_data,
            0,
            max_history_frames,
            0,
            use_real_poses=use_real_poses
        ).to(device, dtype=model_dtype)
    else:
        raise ValueError(f"不支持的模态类型: {modality_type}")

    print(f"完整camera序列shape: {camera_embedding_full.shape}")

    # 10. For camera CFG, the unconditional branch uses an all-zero embedding.
    if use_camera_cfg:
        camera_embedding_uncond = torch.zeros_like(camera_embedding_full)
        print(f"创建无条件camera embedding用于CFG")

    # 11. Sliding-window generation loop.
    total_generated = 0
    all_generated_frames = []

    while total_generated < total_frames_to_generate:
        # Last chunk may be shorter than frames_per_generation.
        current_generation = min(frames_per_generation, total_frames_to_generate - total_generated)
        print(f"\n🔧 生成步骤 {total_generated // frames_per_generation + 1}")
        print(f"当前历史长度: {history_latents.shape[1]}, 本次生成: {current_generation}")

        # Build FramePack conditioning (clean latents at 1x/2x/4x temporal
        # scales, camera embedding slice, and index tensors) for this step.
        framepack_data = prepare_framepack_sliding_window_with_camera_moe(
            history_latents,
            current_generation,
            camera_embedding_full,
            start_frame,
            modality_type,
            max_history_frames
        )

        # Add a batch dimension to all conditioning tensors.
        clean_latents = framepack_data['clean_latents'].unsqueeze(0)
        clean_latents_2x = framepack_data['clean_latents_2x'].unsqueeze(0)
        clean_latents_4x = framepack_data['clean_latents_4x'].unsqueeze(0)
        camera_embedding = framepack_data['camera_embedding'].unsqueeze(0)

        # MoE routing input keyed by modality name.
        modality_inputs = {modality_type: camera_embedding}

        # Unconditional (zero-camera) counterpart, sliced to the same length.
        if use_camera_cfg:
            camera_embedding_uncond_batch = camera_embedding_uncond[:camera_embedding.shape[1], :].unsqueeze(0)
            modality_inputs_uncond = {modality_type: camera_embedding_uncond_batch}

        # Index tensors are consumed on CPU.
        latent_indices = framepack_data['latent_indices'].unsqueeze(0).cpu()
        clean_latent_indices = framepack_data['clean_latent_indices'].unsqueeze(0).cpu()
        clean_latent_2x_indices = framepack_data['clean_latent_2x_indices'].unsqueeze(0).cpu()
        clean_latent_4x_indices = framepack_data['clean_latent_4x_indices'].unsqueeze(0).cpu()

        # Start denoising from pure Gaussian noise for the new frames.
        new_latents = torch.randn(
            1, C, current_generation, H, W,
            device=device, dtype=model_dtype
        )

        extra_input = pipe.prepare_extra_input(new_latents)

        print(f"Camera embedding shape: {camera_embedding.shape}")
        print(f"Camera mask分布 - condition: {torch.sum(camera_embedding[0, :, -1] == 1.0).item()}, target: {torch.sum(camera_embedding[0, :, -1] == 0.0).item()}")

        # Denoising loop with optional CFG.
        timesteps = pipe.scheduler.timesteps

        for i, timestep in enumerate(timesteps):
            if i % 10 == 0:
                print(f"  去噪步骤 {i+1}/{len(timesteps)}")

            timestep_tensor = timestep.unsqueeze(0).to(device, dtype=model_dtype)

            with torch.no_grad():
                if use_camera_cfg and camera_guidance_scale > 1.0:
                    # Conditional pass (with camera conditioning).
                    noise_pred_cond, moe_loess = pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding,
                        modality_inputs=modality_inputs,  # MoE modality input
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **prompt_emb_pos,
                        **extra_input
                    )

                    # Unconditional pass (zeroed camera conditioning).
                    noise_pred_uncond, moe_loess = pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding_uncond_batch,
                        modality_inputs=modality_inputs_uncond,  # MoE unconditional input
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **(prompt_emb_neg if prompt_emb_neg else prompt_emb_pos),
                        **extra_input
                    )

                    # Camera CFG combination.
                    noise_pred = noise_pred_uncond + camera_guidance_scale * (noise_pred_cond - noise_pred_uncond)

                    # Optionally apply text CFG on top of the camera-CFG output.
                    if text_guidance_scale > 1.0 and prompt_emb_neg:
                        noise_pred_text_uncond, moe_loess = pipe.dit(
                            new_latents,
                            timestep=timestep_tensor,
                            cam_emb=camera_embedding,
                            modality_inputs=modality_inputs,
                            latent_indices=latent_indices,
                            clean_latents=clean_latents,
                            clean_latent_indices=clean_latent_indices,
                            clean_latents_2x=clean_latents_2x,
                            clean_latent_2x_indices=clean_latent_2x_indices,
                            clean_latents_4x=clean_latents_4x,
                            clean_latent_4x_indices=clean_latent_4x_indices,
                            **prompt_emb_neg,
                            **extra_input
                        )

                        # Text CFG applied to the camera-CFG result.
                        noise_pred = noise_pred_text_uncond + text_guidance_scale * (noise_pred - noise_pred_text_uncond)

                elif text_guidance_scale > 1.0 and prompt_emb_neg:
                    # Text CFG only (no camera CFG).
                    noise_pred_cond, moe_loess = pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding,
                        modality_inputs=modality_inputs,
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **prompt_emb_pos,
                        **extra_input
                    )

                    noise_pred_uncond, moe_loess = pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding,
                        modality_inputs=modality_inputs,
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **prompt_emb_neg,
                        **extra_input
                    )

                    noise_pred = noise_pred_uncond + text_guidance_scale * (noise_pred_cond - noise_pred_uncond)

                else:
                    # Plain inference (no CFG).
                    noise_pred, moe_loess = pipe.dit(
                        new_latents,
                        timestep=timestep_tensor,
                        cam_emb=camera_embedding,
                        modality_inputs=modality_inputs,  # MoE modality input
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        **prompt_emb_pos,
                        **extra_input
                    )

            new_latents = pipe.scheduler.step(noise_pred, timestep, new_latents)

        # Append the freshly denoised frames to the history.
        new_latents_squeezed = new_latents.squeeze(0)
        history_latents = torch.cat([history_latents, new_latents_squeezed], dim=1)

        # Maintain the sliding window: always keep the very first frame as an
        # anchor plus the most recent (max_history_frames - 1) frames.
        if history_latents.shape[1] > max_history_frames:
            first_frame = history_latents[:, 0:1, :, :]
            recent_frames = history_latents[:, -(max_history_frames-1):, :, :]
            history_latents = torch.cat([first_frame, recent_frames], dim=1)
            print(f"历史窗口已满,保留第一帧+最新{max_history_frames-1}帧")

        print(f"更新后history_latents shape: {history_latents.shape}")

        all_generated_frames.append(new_latents_squeezed)
        total_generated += current_generation

        print(f"✅ 已生成 {total_generated}/{total_frames_to_generate} 帧")

    # 12. Decode the full latent sequence and write the video.
    print("\n🔧 解码生成的视频...")

    all_generated = torch.cat(all_generated_frames, dim=1)
    final_video = torch.cat([initial_latents.to(all_generated.device), all_generated], dim=1).unsqueeze(0)

    print(f"最终视频shape: {final_video.shape}")

    decoded_video = pipe.decode_video(final_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16))

    print(f"Saving video to {output_path}")

    # [-1, 1] float -> [0, 255] uint8, frames-last layout for imageio.
    video_np = decoded_video[0].to(torch.float32).permute(1, 2, 3, 0).cpu().numpy()
    video_np = (video_np * 0.5 + 0.5).clip(0, 1)
    video_np = (video_np * 255).astype(np.uint8)

    with imageio.get_writer(output_path, fps=20) as writer:
        for frame in video_np:
            writer.append_data(frame)

    print(f"🔧 MoE FramePack滑动窗口生成完成! 保存到: {output_path}")
    print(f"总共生成了 {total_generated} 帧 (压缩后), 对应原始 {total_generated * 4} 帧")
    print(f"使用模态: {modality_type}")
1018
+
1019
def main():
    """CLI entry point for MoE FramePack sliding-window generation.

    Parses command-line arguments, prints the run configuration, and forwards
    everything to ``inference_moe_framepack_sliding_window``.
    """
    parser = argparse.ArgumentParser(description="MoE FramePack滑动窗口视频生成 - 支持多模态")

    # Basic arguments
    parser.add_argument("--condition_pth", type=str,
                        #default="/share_zhuyixuan05/zhuyixuan05/sekai-game-drone/00500210001_0012150_0012450/encoded_video.pth")
                        default="/share_zhuyixuan05/zhuyixuan05/nuscenes_video_generation_dynamic/scenes/scene-0001_CAM_FRONT/encoded_video-480p.pth")
                        #default="/share_zhuyixuan05/zhuyixuan05/spatialvid/a9a6d37f-0a6c-548a-a494-7d902469f3f2_0000000_0000300/encoded_video.pth")
                        #default="/share_zhuyixuan05/zhuyixuan05/openx-fractal-encoded/episode_000001/encoded_video.pth")
    parser.add_argument("--start_frame", type=int, default=0)
    parser.add_argument("--initial_condition_frames", type=int, default=16)
    parser.add_argument("--frames_per_generation", type=int, default=8)
    parser.add_argument("--total_frames_to_generate", type=int, default=24)
    parser.add_argument("--max_history_frames", type=int, default=100)
    # BUG FIX: previously `default=False` with no action/type, so any value
    # passed on the command line (even the string "False") was truthy.
    parser.add_argument("--use_real_poses", action="store_true", default=False)
    parser.add_argument("--dit_path", type=str,
                        default="/share_zhuyixuan05/zhuyixuan05/ICLR2026/framepack_moe/step175000_origin_other_continue3.ckpt")
    parser.add_argument("--output_path", type=str,
                        default='/home/zhuyixuan05/ReCamMaster/moe/infer_results/output_moe_framepack_sliding.mp4')
    parser.add_argument("--prompt", type=str,
                        default="A car is driving")
    parser.add_argument("--device", type=str, default="cuda")

    # Modality selection
    parser.add_argument("--modality_type", type=str, choices=["sekai", "nuscenes", "openx"], default="nuscenes",
                        help="模态类型:sekai 或 nuscenes 或 openx")
    parser.add_argument("--scene_info_path", type=str, default=None,
                        help="NuScenes场景信息文件路径(仅用于nuscenes模态)")

    # CFG parameters
    # BUG FIX: same truthy-string problem as --use_real_poses; now a flag.
    parser.add_argument("--use_camera_cfg", action="store_true", default=False,
                        help="使用Camera CFG")
    parser.add_argument("--camera_guidance_scale", type=float, default=2.0,
                        help="Camera guidance scale for CFG")
    parser.add_argument("--text_guidance_scale", type=float, default=1.0,
                        help="Text guidance scale for CFG")

    # MoE parameters
    parser.add_argument("--moe_num_experts", type=int, default=3, help="专家数量")
    parser.add_argument("--moe_top_k", type=int, default=1, help="Top-K专家")
    parser.add_argument("--moe_hidden_dim", type=int, default=None, help="MoE隐藏层维度")
    parser.add_argument("--direction", type=str, default="left")
    parser.add_argument("--use_gt_prompt", action="store_true", default=False,
                        help="使用数据集中的ground truth prompt embedding")

    args = parser.parse_args()

    print(f"🔧 MoE FramePack CFG生成设置:")
    print(f"模态类型: {args.modality_type}")
    print(f"Camera CFG: {args.use_camera_cfg}")
    if args.use_camera_cfg:
        print(f"Camera guidance scale: {args.camera_guidance_scale}")
    print(f"使用GT Prompt: {args.use_gt_prompt}")
    print(f"Text guidance scale: {args.text_guidance_scale}")
    print(f"MoE配置: experts={args.moe_num_experts}, top_k={args.moe_top_k}")
    # BUG FIX: log string was missing the separator ("DiT<path>").
    print(f"DiT: {args.dit_path}")

    # Warn when NuScenes is selected without real pose metadata.
    if args.modality_type == "nuscenes" and not args.scene_info_path:
        print("⚠️ 使用NuScenes模态但未提供scene_info_path,将使用合成pose数据")

    inference_moe_framepack_sliding_window(
        condition_pth_path=args.condition_pth,
        dit_path=args.dit_path,
        output_path=args.output_path,
        start_frame=args.start_frame,
        initial_condition_frames=args.initial_condition_frames,
        frames_per_generation=args.frames_per_generation,
        total_frames_to_generate=args.total_frames_to_generate,
        max_history_frames=args.max_history_frames,
        device=args.device,
        prompt=args.prompt,
        modality_type=args.modality_type,
        use_real_poses=args.use_real_poses,
        scene_info_path=args.scene_info_path,
        # CFG parameters
        use_camera_cfg=args.use_camera_cfg,
        camera_guidance_scale=args.camera_guidance_scale,
        text_guidance_scale=args.text_guidance_scale,
        # MoE parameters
        moe_num_experts=args.moe_num_experts,
        moe_top_k=args.moe_top_k,
        moe_hidden_dim=args.moe_hidden_dim,
        direction=args.direction,
        use_gt_prompt=args.use_gt_prompt
    )


if __name__ == "__main__":
    main()
scripts/infer_recam.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import torch
3
+ import torch.nn as nn
4
+ from diffsynth import ModelManager, WanVideoReCamMasterPipeline, save_video, VideoData
5
+ import torch, os, imageio, argparse
6
+ from torchvision.transforms import v2
7
+ from einops import rearrange
8
+ import pandas as pd
9
+ import torchvision
10
+ from PIL import Image
11
+ import numpy as np
12
+ import json
13
+
14
class Camera(object):
    """Camera pose holder.

    Keeps both the camera-to-world matrix (``c2w_mat``) and its inverse,
    the world-to-camera matrix (``w2c_mat``), as 4x4 numpy arrays.
    """

    def __init__(self, c2w):
        """Build from any value reshapeable to a 4x4 camera-to-world matrix."""
        cam_to_world = np.array(c2w).reshape(4, 4)
        world_to_cam = np.linalg.inv(cam_to_world)
        self.c2w_mat = cam_to_world
        self.w2c_mat = world_to_cam
+
20
class TextVideoCameraDataset(torch.utils.data.Dataset):
    """Dataset yielding (text, 81-frame video clip, target camera trajectory).

    Video paths and captions come from a CSV (columns ``file_name``/``text``);
    camera extrinsics for the target trajectory come from a fixed JSON file
    under ``./example_test_data/cameras/``. Each item's camera output is a
    ``[target_frames, 12]`` bfloat16 tensor of flattened relative 3x4 poses.
    """

    def __init__(self, base_path, metadata_path, args, max_num_frames=81, frame_interval=1, num_frames=81, height=480, width=832, is_i2v=False, condition_frames=40, target_frames=20):
        # Build the list of video paths and matching captions from the CSV.
        metadata = pd.read_csv(metadata_path)
        self.path = [os.path.join(base_path, "videos", file_name) for file_name in metadata["file_name"]]
        self.text = metadata["text"].to_list()

        self.max_num_frames = max_num_frames
        self.frame_interval = frame_interval
        self.num_frames = num_frames
        self.height = height
        self.width = width
        self.is_i2v = is_i2v
        self.args = args
        # Which camera trajectory of the extrinsics JSON to use (e.g. "1" -> "cam01").
        self.cam_type = self.args.cam_type

        # Frame-count configuration for the condition/target split.
        self.condition_frames = condition_frames
        self.target_frames = target_frames

        # Per-frame preprocessing: crop+resize then normalize to [-1, 1].
        self.frame_process = v2.Compose([
            v2.CenterCrop(size=(height, width)),
            v2.Resize(size=(height, width), antialias=True),
            v2.ToTensor(),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])

    def crop_and_resize(self, image):
        """Scale a PIL image up so both target dimensions are covered (aspect-preserving)."""
        width, height = image.size
        scale = max(self.width / width, self.height / height)
        image = torchvision.transforms.functional.resize(
            image,
            (round(height*scale), round(width*scale)),
            interpolation=torchvision.transforms.InterpolationMode.BILINEAR
        )
        return image

    def load_frames_using_imageio(self, file_path, max_num_frames, start_frame_id, interval, num_frames, frame_process):
        """Read ``num_frames`` frames starting at ``start_frame_id`` with the given stride.

        Returns a ``[C, T, H, W]`` tensor (plus the raw first frame as a numpy
        array when ``is_i2v``), or ``None`` if the video is too short.
        """
        reader = imageio.get_reader(file_path)
        # Bail out if the clip cannot supply the requested frame window.
        if reader.count_frames() < max_num_frames or reader.count_frames() - 1 < start_frame_id + (num_frames - 1) * interval:
            reader.close()
            return None

        frames = []
        first_frame = None
        for frame_id in range(num_frames):
            frame = reader.get_data(start_frame_id + frame_id * interval)
            frame = Image.fromarray(frame)
            frame = self.crop_and_resize(frame)
            # Keep the un-normalized first frame for image-to-video use.
            if first_frame is None:
                first_frame = np.array(frame)
            frame = frame_process(frame)
            frames.append(frame)
        reader.close()

        frames = torch.stack(frames, dim=0)
        frames = rearrange(frames, "T C H W -> C T H W")

        if self.is_i2v:
            return frames, first_frame
        else:
            return frames

    def is_image(self, file_path):
        """Return True when the path has a common image extension."""
        file_ext_name = file_path.split(".")[-1]
        if file_ext_name.lower() in ["jpg", "jpeg", "png", "webp"]:
            return True
        return False

    def load_video(self, file_path):
        """Load a clip starting at a random valid offset within the video."""
        start_frame_id = torch.randint(0, self.max_num_frames - (self.num_frames - 1) * self.frame_interval, (1,))[0]
        frames = self.load_frames_using_imageio(file_path, self.max_num_frames, start_frame_id, self.frame_interval, self.num_frames, self.frame_process)
        return frames

    def parse_matrix(self, matrix_str):
        """Parse a matrix serialized like "[a b c] [d e f] ..." into a numpy array."""
        rows = matrix_str.strip().split('] [')
        matrix = []
        for row in rows:
            row = row.replace('[', '').replace(']', '')
            matrix.append(list(map(float, row.split())))
        return np.array(matrix)

    def get_relative_pose(self, cam_params):
        """Re-express a list of ``Camera`` poses relative to the first camera.

        Returns a ``[N, 4, 4]`` float32 array whose first entry is the
        canonical target frame and whose remaining entries are the other
        cameras mapped into that frame.
        """
        abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params]
        abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params]

        # Canonical pose the first camera is mapped onto (identity here,
        # since cam_to_origin is 0).
        cam_to_origin = 0
        target_cam_c2w = np.array([
            [1, 0, 0, 0],
            [0, 1, 0, -cam_to_origin],
            [0, 0, 1, 0],
            [0, 0, 0, 1]
        ])
        # Transform mapping absolute poses into the first camera's frame.
        abs2rel = target_cam_c2w @ abs_w2cs[0]
        ret_poses = [target_cam_c2w, ] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]]
        ret_poses = np.array(ret_poses, dtype=np.float32)
        return ret_poses

    def __getitem__(self, data_id):
        """Return dict with keys: text, video ([C,81,H,W]), path, camera ([target_frames,12])."""
        text = self.text[data_id]
        path = self.path[data_id]
        video = self.load_video(path)
        if video is None:
            raise ValueError(f"{path} is not a valid video.")
        num_frames = video.shape[1]
        assert num_frames == 81
        data = {"text": text, "video": video, "path": path}

        # Load the shared target-camera extrinsics file.
        tgt_camera_path = "./example_test_data/cameras/camera_extrinsics.json"
        with open(tgt_camera_path, 'r') as file:
            cam_data = json.load(file)

        # Sample target_frames poses evenly across the 81-frame trajectory.
        cam_idx = np.linspace(0, 80, self.target_frames, dtype=int).tolist()
        traj = [self.parse_matrix(cam_data[f"frame{idx}"][f"cam{int(self.cam_type):02d}"]) for idx in cam_idx]
        traj = np.stack(traj).transpose(0, 2, 1)
        c2ws = []
        for c2w in traj:
            # Axis permutation / sign flip / translation rescale to convert the
            # stored convention into the model's camera convention.
            # NOTE(review): the /100 suggests centimeters -> meters — confirm
            # against the extrinsics file's units.
            c2w = c2w[:, [1, 2, 0, 3]]
            c2w[:3, 1] *= -1.
            c2w[:3, 3] /= 100
            c2ws.append(c2w)
        tgt_cam_params = [Camera(cam_param) for cam_param in c2ws]
        relative_poses = []
        for i in range(len(tgt_cam_params)):
            # Pose of frame i relative to frame 0; take the 3x4 part of entry [1].
            relative_pose = self.get_relative_pose([tgt_cam_params[0], tgt_cam_params[i]])
            relative_poses.append(torch.as_tensor(relative_pose)[:,:3,:][1])
        pose_embedding = torch.stack(relative_poses, dim=0)  # [target_frames, 3, 4]
        pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)')  # [target_frames, 12]
        data['camera'] = pose_embedding.to(torch.bfloat16)
        return data

    def __len__(self):
        """Number of videos listed in the metadata CSV."""
        return len(self.path)
154
+
155
def parse_args(argv=None):
    """Parse command-line arguments for ReCamMaster inference.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``, in
            which case argparse reads ``sys.argv[1:]``. Accepting an explicit
            list keeps the function testable without touching process argv
            (backward-compatible generalization — existing ``parse_args()``
            calls behave exactly as before).

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser(description="ReCamMaster Inference")
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="./example_test_data",
        help="The path of the Dataset.",
    )
    parser.add_argument(
        "--ckpt_path",
        type=str,
        default="/share_zhuyixuan05/zhuyixuan05/recam_future_checkpoint/step1000.ckpt",
        help="Path to save the model.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="./results",
        help="Path to save the results.",
    )
    parser.add_argument(
        "--dataloader_num_workers",
        type=int,
        default=1,
        help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.",
    )
    # NOTE: declared as str (consumers do int(self.cam_type)), so the int
    # default 1 is only seen when the flag is omitted.
    parser.add_argument(
        "--cam_type",
        type=str,
        default=1,
    )
    parser.add_argument(
        "--cfg_scale",
        type=float,
        default=5.0,
    )
    # Condition/target frame-count split for generation.
    parser.add_argument(
        "--condition_frames",
        type=int,
        default=15,
        help="Number of condition frames",
    )
    parser.add_argument(
        "--target_frames",
        type=int,
        default=15,
        help="Number of target frames to generate",
    )
    args = parser.parse_args(argv)
    return args
206
+
207
if __name__ == '__main__':
    args = parse_args()

    # 1. Load Wan2.1 pre-trained models (DiT + T5 text encoder + VAE) on CPU.
    model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
    model_manager.load_models([
        "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
        "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
        "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
    ])
    pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager, device="cuda")

    # 2. Initialize additional modules introduced in ReCamMaster: a per-block
    # camera encoder (zero-init) and projector (identity-init) so the new
    # modules are no-ops until the checkpoint overwrites them.
    dim=pipe.dit.blocks[0].self_attn.q.weight.shape[0]
    for block in pipe.dit.blocks:
        block.cam_encoder = nn.Linear(12, dim)
        block.projector = nn.Linear(dim, dim)
        block.cam_encoder.weight.data.zero_()
        block.cam_encoder.bias.data.zero_()
        block.projector.weight = nn.Parameter(torch.eye(dim))
        block.projector.bias = nn.Parameter(torch.zeros(dim))

    # 3. Load the ReCamMaster checkpoint (strict: every key must match).
    state_dict = torch.load(args.ckpt_path, map_location="cpu")
    pipe.dit.load_state_dict(state_dict, strict=True)
    pipe.to("cuda")
    pipe.to(dtype=torch.bfloat16)

    # One output directory per camera trajectory type.
    output_dir = os.path.join(args.output_dir, f"cam_type{args.cam_type}")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # 4. Prepare test data (source video, target camera, target trajectory).
    dataset = TextVideoCameraDataset(
        args.dataset_path,
        os.path.join(args.dataset_path, "metadata.csv"),
        args,
        condition_frames=args.condition_frames,  # forwarded frame split
        target_frames=args.target_frames,
    )
    dataloader = torch.utils.data.DataLoader(
        dataset,
        shuffle=False,
        batch_size=1,
        num_workers=args.dataloader_num_workers
    )

    # 5. Inference: one generated video per dataset item.
    for batch_idx, batch in enumerate(dataloader):
        target_text = batch["text"]
        source_video = batch["video"]
        target_camera = batch["camera"]

        video = pipe(
            prompt=target_text,
            negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的��景,三条腿,背景人很多,倒着走",
            source_video=source_video,
            target_camera=target_camera,
            cfg_scale=args.cfg_scale,
            num_inference_steps=50,
            seed=0,
            tiled=True,
            condition_frames=args.condition_frames,
            target_frames=args.target_frames,
        )
        save_video(video, os.path.join(output_dir, f"video{batch_idx}.mp4"), fps=30, quality=5)
scripts/infer_rlbench.py ADDED
@@ -0,0 +1,447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ from PIL import Image
5
+ import imageio
6
+ import json
7
+ from diffsynth import WanVideoReCamMasterPipeline, ModelManager
8
+ import argparse
9
+ from torchvision.transforms import v2
10
+ from einops import rearrange
11
+ import torch.nn as nn
12
+
13
+
14
def load_encoded_video_from_pth(pth_path, start_frame=0, num_frames=10):
    """Load a window of pre-encoded video latents from a .pth file.

    Args:
        pth_path: path to the .pth file produced by the video encoder.
        start_frame: first latent frame to extract (index in compressed
            latent time, not raw video frames).
        num_frames: number of latent frames to extract.

    Returns:
        Tuple of (latent slice of shape [C, num_frames, H, W], the full
        dict loaded from the file).

    Raises:
        ValueError: if the file holds fewer latent frames than requested.
    """
    print(f"Loading encoded video from {pth_path}")

    # NOTE: weights_only=False runs arbitrary unpickling — only load trusted files.
    payload = torch.load(pth_path, weights_only=False, map_location="cpu")

    latents = payload['latents']  # [C, T, H, W]

    print(f"Full latents shape: {latents.shape}")
    print(f"Extracting frames {start_frame} to {start_frame + num_frames}")

    # Validate the requested window before slicing.
    end_frame = start_frame + num_frames
    if end_frame > latents.shape[1]:
        raise ValueError(f"Not enough frames: requested {end_frame}, available {latents.shape[1]}")

    window = latents[:, start_frame:end_frame, :, :]

    print(f"Extracted condition latents shape: {window.shape}")

    return window, payload
45
+
46
+
47
def compute_relative_pose(pose_a, pose_b, use_torch=False):
    """Return the pose of camera B relative to camera A.

    Computes ``pose_b @ inv(pose_a)`` for two 4x4 extrinsic matrices.
    Inputs may be numpy arrays or torch tensors; non-matching inputs are
    converted to float32 in the selected backend.

    Args:
        pose_a: 4x4 extrinsic matrix of the reference camera A.
        pose_b: 4x4 extrinsic matrix of camera B.
        use_torch: compute with torch ops when True, numpy otherwise.

    Returns:
        4x4 relative pose matrix in the selected backend's type.
    """
    assert pose_a.shape == (4, 4), f"相机A外参矩阵形状应为(4,4),实际为{pose_a.shape}"
    assert pose_b.shape == (4, 4), f"相机B外参矩阵形状应为(4,4),实际为{pose_b.shape}"

    # Torch backend: promote numpy inputs, then B @ A^-1.
    if use_torch:
        if not isinstance(pose_a, torch.Tensor):
            pose_a = torch.from_numpy(pose_a).float()
        if not isinstance(pose_b, torch.Tensor):
            pose_b = torch.from_numpy(pose_b).float()
        return torch.matmul(pose_b, torch.inverse(pose_a))

    # Numpy backend: promote non-array inputs, then B @ A^-1.
    if not isinstance(pose_a, np.ndarray):
        pose_a = np.array(pose_a, dtype=np.float32)
    if not isinstance(pose_b, np.ndarray):
        pose_b = np.array(pose_b, dtype=np.float32)
    return np.matmul(pose_b, np.linalg.inv(pose_a))
72
+
73
+
74
def generate_camera_poses_from_data(cam_data, start_frame, condition_frames, target_frames):
    """Build a camera embedding from recorded camera extrinsics.

    Args:
        cam_data: camera extrinsics sequence, indexed by original frame index.
        start_frame: start frame (compressed latent index).
        condition_frames: number of condition frames (compressed).
        target_frames: number of target frames (compressed).

    Returns:
        bfloat16 tensor of per-frame pose entries with a trailing mask column
        (1.0 marks condition frames, 0.0 marks target frames).
    """
    # One latent frame corresponds to 4 original video frames.
    time_compression_ratio = 4
    total_frames = condition_frames + target_frames

    # Camera extrinsics sequence.
    cam_extrinsic = cam_data  # [N, 4, 4]

    # Map compressed latent indices back to original frame indices.
    start_frame_original = start_frame * time_compression_ratio
    end_frame_original = (start_frame + total_frames) * time_compression_ratio

    print(f"Using camera data from frame {start_frame_original} to {end_frame_original}")

    # Sample one pose per compressed frame, every time_compression_ratio
    # original frames.
    relative_poses = []
    for i in range(total_frames):
        frame_idx = start_frame_original + i * time_compression_ratio
        # NOTE(review): next_frame_idx is never used — frame-to-frame relative
        # poses are NOT computed here despite the list's name; confirm intent.
        next_frame_idx = frame_idx + time_compression_ratio

        cam_prev = cam_extrinsic[frame_idx]

        # NOTE(review): the old comment said "take the top 3 rows", but the
        # full entry is appended. The torch.cat(dim=1) below only works if
        # each entry is a 1-D vector, so cam_data entries are presumably
        # pre-flattened pose vectors rather than 4x4 matrices — confirm.
        relative_poses.append(torch.as_tensor(cam_prev))

        print(cam_prev)  # debug: dump each sampled pose
    # Assemble the pose embedding.
    pose_embedding = torch.stack(relative_poses, dim=0)
    # print('pose_embedding init:',pose_embedding[0])
    print('pose_embedding:',pose_embedding)
    # assert False

    # pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)') # [frames, 12]

    # Mask column: 1.0 for condition frames, 0.0 for target frames.
    mask = torch.zeros(total_frames, dtype=torch.float32)
    mask[:condition_frames] = 1.0  # condition frames
    mask = mask.view(-1, 1)

    # Combine pose and mask into one embedding.
    camera_embedding = torch.cat([pose_embedding, mask], dim=1)  # [frames, 13]

    print(f"Generated camera embedding shape: {camera_embedding.shape}")

    return camera_embedding.to(torch.bfloat16)
128
+
129
+
130
def generate_camera_poses(direction="forward", target_frames=10, condition_frames=20):
    """Generate a synthetic camera-pose embedding for a fixed motion direction.

    Args:
        direction: one of "forward", "backward", "left_turn", "right_turn".
            Any other value yields identity poses (no motion).
        target_frames: number of frames to generate (compressed latent frames).
        condition_frames: number of conditioning frames (compressed latent frames).

    Returns:
        bfloat16 tensor of shape [condition_frames + target_frames, 13]:
        the flattened top 3x4 of each frame-to-frame relative pose (12 values)
        plus a trailing mask column (1.0 for condition frames, else 0.0).
    """
    total_frames = condition_frames + target_frames

    # Absolute camera poses along the requested trajectory.
    poses = []
    for i in range(total_frames):
        t = i / max(1, total_frames - 1)  # normalized progress in [0, 1]

        pose = np.eye(4, dtype=np.float32)

        if direction == "forward":
            # Forward: translate along -z.
            pose[2, 3] = -t * 0.04
        elif direction == "backward":
            # Backward: translate along +z.
            pose[2, 3] = t * 2.0
        elif direction == "left_turn":
            # Left turn: move forward and left while yawing around y.
            pose[2, 3] = -t * 0.03  # forward
            pose[0, 3] = t * 0.02   # leftward
            yaw = t * 1
            pose[0, 0] = np.cos(yaw)
            pose[0, 2] = np.sin(yaw)
            pose[2, 0] = -np.sin(yaw)
            pose[2, 2] = np.cos(yaw)
        elif direction == "right_turn":
            # Right turn: move forward and right while yawing the other way.
            pose[2, 3] = -t * 0.03   # forward
            pose[0, 3] = -t * 0.02   # rightward
            yaw = -t * 1
            pose[0, 0] = np.cos(yaw)
            pose[0, 2] = np.sin(yaw)
            pose[2, 0] = -np.sin(yaw)
            pose[2, 2] = np.cos(yaw)

        poses.append(pose)

    # Frame-to-frame relative poses; keep only the top 3 rows (3x4 = 12 values).
    relative_poses = []
    for i in range(len(poses) - 1):
        relative_pose = compute_relative_pose(poses[i], poses[i + 1])
        relative_poses.append(torch.as_tensor(relative_pose[:3, :]))

    # N absolute poses yield N-1 relative transforms; pad to total_frames.
    if len(relative_poses) < total_frames:
        if relative_poses:
            relative_poses.append(relative_poses[-1])
        else:
            # total_frames == 1: nothing to difference — use the identity
            # transform instead of indexing into an empty list.
            relative_poses.append(torch.as_tensor(np.eye(4, dtype=np.float32)[:3, :]))

    pose_embedding = torch.stack(relative_poses[:total_frames], dim=0)
    # Flatten each 3x4 matrix into a 12-vector.
    pose_embedding = pose_embedding.reshape(total_frames, -1)  # [frames, 12]

    # Mask column: 1.0 marks condition frames, 0.0 marks target frames.
    mask = torch.zeros(total_frames, dtype=torch.float32)
    mask[:condition_frames] = 1.0
    mask = mask.view(-1, 1)

    camera_embedding = torch.cat([pose_embedding, mask], dim=1)  # [frames, 13]

    print(f"Generated {direction} movement poses:")
    print(f"  Total frames: {total_frames}")
    print(f"  Camera embedding shape: {camera_embedding.shape}")

    return camera_embedding.to(torch.bfloat16)
210
+
211
+
212
def inference_sekai_video_from_pth(
    condition_pth_path,
    dit_path,
    output_path="sekai/infer_results/output_sekai.mp4",
    start_frame=0,
    condition_frames=10,  # compressed latent frames
    target_frames=2,  # compressed latent frames
    device="cuda",
    prompt="a robotic arm executing precise manipulation tasks on a clean, organized desk",
    direction="forward",
    use_real_poses=True
):
    """Run Sekai video inference from a pre-encoded latent .pth file.

    Loads the Wan2.1 pipeline plus a trained ReCamMaster DiT checkpoint,
    conditions on the stored latents, denoises target frames under a camera
    pose embedding (real or synthetic), decodes, and writes an mp4.
    """
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    print(f"Setting up models for {direction} movement...")

    # 1. Load base models on CPU first, then build the pipeline on GPU.
    model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
    model_manager.load_models([
        "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
        "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
        "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
    ])
    pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager, device="cuda")

    # Add camera components to each DiT block before loading the checkpoint,
    # so the trained cam_encoder/projector weights have somewhere to land.
    dim = pipe.dit.blocks[0].self_attn.q.weight.shape[0]
    for block in pipe.dit.blocks:
        # NOTE(review): the old comment said "13-dim embedding (12D pose +
        # 1D mask)" but the layer takes 30 inputs — confirm the camera
        # embedding actually fed in is 30-dimensional.
        block.cam_encoder = nn.Linear(30, dim)
        block.projector = nn.Linear(dim, dim)
        # Zero-init the encoder and identity-init the projector; the real
        # values are overwritten by load_state_dict below (strict=True).
        block.cam_encoder.weight.data.zero_()
        block.cam_encoder.bias.data.zero_()
        block.projector.weight = nn.Parameter(torch.eye(dim))
        block.projector.bias = nn.Parameter(torch.zeros(dim))

    # Load trained DiT weights.
    dit_state_dict = torch.load(dit_path, map_location="cpu")
    pipe.dit.load_state_dict(dit_state_dict, strict=True)
    pipe = pipe.to(device)
    pipe.scheduler.set_timesteps(50)

    print("Loading condition video from pth...")

    # Load condition latents from the pth file.
    condition_latents, encoded_data = load_encoded_video_from_pth(
        condition_pth_path,
        start_frame=start_frame,
        num_frames=condition_frames
    )

    # Add the batch dimension: [C, T, H, W] -> [1, C, T, H, W].
    condition_latents = condition_latents.unsqueeze(0).to(device, dtype=pipe.torch_dtype)

    print("Processing poses...")

    # Build the camera pose embedding: real poses from the data when
    # available and requested, otherwise a synthetic trajectory.
    if use_real_poses and 'cam_emb' in encoded_data:
        print("Using real camera poses from data")
        camera_embedding = generate_camera_poses_from_data(
            encoded_data['cam_emb'],
            start_frame=start_frame,
            condition_frames=condition_frames,
            target_frames=target_frames
        )
    else:
        print(f"Using synthetic {direction} poses")
        camera_embedding = generate_camera_poses(
            direction=direction,
            target_frames=target_frames,
            condition_frames=condition_frames
        )

    camera_embedding = camera_embedding.unsqueeze(0).to(device, dtype=torch.bfloat16)

    print(f"Camera embedding shape: {camera_embedding.shape}")

    print("Encoding prompt...")

    # Encode text prompt.
    prompt_emb = pipe.encode_prompt(prompt)

    print("Generating video...")

    # Target latent dimensions follow the condition latents.
    batch_size = 1
    channels = condition_latents.shape[1]
    latent_height = condition_latents.shape[3]
    latent_width = condition_latents.shape[4]

    # Optional center crop in latent space to bound memory usage.
    target_height, target_width = 64, 64

    if latent_height > target_height or latent_width > target_width:
        # Center crop.
        h_start = (latent_height - target_height) // 2
        w_start = (latent_width - target_width) // 2
        condition_latents = condition_latents[:, :, :,
                                              h_start:h_start+target_height,
                                              w_start:w_start+target_width]
        latent_height = target_height
        latent_width = target_width

    # Initialize target latents with noise.
    target_latents = torch.randn(
        batch_size, channels, target_frames, latent_height, latent_width,
        device=device, dtype=pipe.torch_dtype
    )

    print(f"Condition latents shape: {condition_latents.shape}")
    print(f"Target latents shape: {target_latents.shape}")
    print(f"Camera embedding shape: {camera_embedding.shape}")

    # Concatenate condition and target latents along the time axis.
    combined_latents = torch.cat([condition_latents, target_latents], dim=2)
    print(f"Combined latents shape: {combined_latents.shape}")

    # Prepare extra model inputs (e.g. positional info) from the pipeline.
    extra_input = pipe.prepare_extra_input(combined_latents)

    # Denoising loop: only the target part is updated each step; the
    # condition part stays fixed as clean context.
    timesteps = pipe.scheduler.timesteps

    for i, timestep in enumerate(timesteps):
        print(f"Denoising step {i+1}/{len(timesteps)}")

        # Prepare timestep as a batched tensor in the pipeline dtype.
        timestep_tensor = timestep.unsqueeze(0).to(device, dtype=pipe.torch_dtype)

        # Predict noise over the full (condition + target) sequence.
        with torch.no_grad():
            noise_pred = pipe.dit(
                combined_latents,
                timestep=timestep_tensor,
                cam_emb=camera_embedding,
                **prompt_emb,
                **extra_input
            )

        # Step the scheduler on the target slice only.
        target_noise_pred = noise_pred[:, :, condition_frames:, :, :]
        target_latents = pipe.scheduler.step(target_noise_pred, timestep, target_latents)

        # Write the updated target slice back into the combined latents.
        combined_latents[:, :, condition_frames:, :, :] = target_latents

    print("Decoding video...")

    # Decode the final latent sequence with tiled VAE decoding.
    final_video = torch.cat([condition_latents, target_latents], dim=2)
    decoded_video = pipe.decode_video(final_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16))

    # Save video.
    print(f"Saving video to {output_path}")

    # Convert to HWC uint8 frames: denormalize from [-1, 1] to [0, 255].
    video_np = decoded_video[0].to(torch.float32).permute(1, 2, 3, 0).cpu().numpy()
    video_np = (video_np * 0.5 + 0.5).clip(0, 1)  # Denormalize
    video_np = (video_np * 255).astype(np.uint8)

    with imageio.get_writer(output_path, fps=20) as writer:
        for frame in video_np:
            writer.append_data(frame)

    print(f"Video generation completed! Saved to {output_path}")
380
+
381
+
382
def main():
    """CLI entry point: parse arguments and run Sekai inference from a .pth file."""
    parser = argparse.ArgumentParser(description="Sekai Video Generation Inference from PTH")
    parser.add_argument("--condition_pth", type=str,
                        default="/share_zhuyixuan05/zhuyixuan05/rlbench/OpenBox_demo_49/encoded_video.pth")
    parser.add_argument("--start_frame", type=int, default=0,
                        help="Starting frame index (compressed latent frames)")
    parser.add_argument("--condition_frames", type=int, default=8,
                        help="Number of condition frames (compressed latent frames)")
    parser.add_argument("--target_frames", type=int, default=8,
                        help="Number of target frames to generate (compressed latent frames)")
    parser.add_argument("--direction", type=str, default="left_turn",
                        choices=["forward", "backward", "left_turn", "right_turn"],
                        help="Direction of camera movement (if not using real poses)")
    # Fixed: this was declared with only `default=False` (no action/type), so
    # ANY supplied value — even the string "False" — parsed as truthy.
    # store_true keeps the default of False and makes it a proper boolean flag.
    parser.add_argument("--use_real_poses", action="store_true",
                        help="Use real camera poses from data")
    parser.add_argument("--dit_path", type=str, default="/home/zhuyixuan05/ReCamMaster/RLBench-train/step2000_dynamic.ckpt",
                        help="Path to trained DiT checkpoint")
    parser.add_argument("--output_path", type=str, default='/home/zhuyixuan05/ReCamMaster/rlbench/infer_results/output_rl_2.mp4',
                        help="Output video path")
    parser.add_argument("--prompt", type=str,
                        default="a robotic arm executing precise manipulation tasks on a clean, organized desk",
                        help="Text prompt for generation")
    parser.add_argument("--device", type=str, default="cuda",
                        help="Device to run inference on")

    args = parser.parse_args()

    # Auto-generate an output path from the input name and settings.
    # NOTE(review): with the current non-None --output_path default this
    # branch is unreachable unless the default is changed to None.
    if args.output_path is None:
        pth_filename = os.path.basename(args.condition_pth)
        name_parts = os.path.splitext(pth_filename)
        output_dir = "rlbench/infer_results"
        os.makedirs(output_dir, exist_ok=True)

        if args.use_real_poses:
            output_filename = f"{name_parts[0]}_real_poses_{args.start_frame}_{args.condition_frames}_{args.target_frames}.mp4"
        else:
            output_filename = f"{name_parts[0]}_{args.direction}_{args.start_frame}_{args.condition_frames}_{args.target_frames}.mp4"

        output_path = os.path.join(output_dir, output_filename)
    else:
        output_path = args.output_path

    print(f"Input pth: {args.condition_pth}")
    print(f"Start frame: {args.start_frame} (compressed)")
    print(f"Condition frames: {args.condition_frames} (compressed, original: {args.condition_frames * 4})")
    print(f"Target frames: {args.target_frames} (compressed, original: {args.target_frames * 4})")
    print(f"Use real poses: {args.use_real_poses}")
    print(f"Output video will be saved to: {output_path}")

    inference_sekai_video_from_pth(
        condition_pth_path=args.condition_pth,
        dit_path=args.dit_path,
        output_path=output_path,
        start_frame=args.start_frame,
        condition_frames=args.condition_frames,
        target_frames=args.target_frames,
        device=args.device,
        prompt=args.prompt,
        direction=args.direction,
        use_real_poses=args.use_real_poses
    )
444
+
445
+
446
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()