studyOverflow committed on
Commit
b171568
·
verified ·
1 Parent(s): 45d12c1

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. fastvideo/config_sd/__pycache__/base.cpython-310.pyc +0 -0
  2. fastvideo/config_sd/base.py +113 -0
  3. fastvideo/config_sd/dgx.py +60 -0
  4. fastvideo/data_preprocess/.DS_Store +0 -0
  5. fastvideo/data_preprocess/preprocess_flux_embedding.py +170 -0
  6. fastvideo/data_preprocess/preprocess_flux_embedding_rlpt.py +172 -0
  7. fastvideo/data_preprocess/preprocess_flux_rfpt_embedding.py +224 -0
  8. fastvideo/data_preprocess/preprocess_qwenimage_embedding.py +220 -0
  9. fastvideo/data_preprocess/preprocess_rl_embeddings.py +175 -0
  10. fastvideo/data_preprocess/preprocess_text_embeddings.py +175 -0
  11. fastvideo/data_preprocess/preprocess_vae_latents.py +137 -0
  12. fastvideo/data_preprocess/preprocess_validation_text_embeddings.py +80 -0
  13. fastvideo/dataset/.DS_Store +0 -0
  14. fastvideo/dataset/__init__.py +104 -0
  15. fastvideo/dataset/__pycache__/__init__.cpython-310.pyc +0 -0
  16. fastvideo/dataset/__pycache__/__init__.cpython-312.pyc +0 -0
  17. fastvideo/dataset/__pycache__/latent_flux_rfpt_datasets.cpython-312.pyc +0 -0
  18. fastvideo/dataset/__pycache__/latent_flux_rfpt_datasets_all.cpython-312.pyc +0 -0
  19. fastvideo/dataset/__pycache__/latent_flux_rl_datasets.cpython-312.pyc +0 -0
  20. fastvideo/dataset/__pycache__/latent_qwenimage_rl_datasets.cpython-310.pyc +0 -0
  21. fastvideo/dataset/__pycache__/t2v_datasets.cpython-310.pyc +0 -0
  22. fastvideo/dataset/__pycache__/t2v_datasets.cpython-312.pyc +0 -0
  23. fastvideo/dataset/__pycache__/transform.cpython-310.pyc +0 -0
  24. fastvideo/dataset/__pycache__/transform.cpython-312.pyc +0 -0
  25. fastvideo/dataset/latent_datasets.py +132 -0
  26. fastvideo/dataset/latent_flux_rfpt_datasets.py +122 -0
  27. fastvideo/dataset/latent_flux_rfpt_datasets_all.py +134 -0
  28. fastvideo/dataset/latent_flux_rl_datasets.py +110 -0
  29. fastvideo/dataset/latent_qwenimage_rl_datasets.py +90 -0
  30. fastvideo/dataset/latent_rl_datasets.py +99 -0
  31. fastvideo/dataset/t2v_datasets.py +351 -0
  32. fastvideo/dataset/transform.py +647 -0
  33. fastvideo/distill/__init__.py +0 -0
  34. fastvideo/distill/__pycache__/__init__.cpython-312.pyc +0 -0
  35. fastvideo/distill/__pycache__/solver.cpython-312.pyc +0 -0
  36. fastvideo/distill/discriminator.py +84 -0
  37. fastvideo/distill/solver.py +310 -0
  38. fastvideo/models/.DS_Store +0 -0
  39. fastvideo/models/__pycache__/flash_attn_no_pad.cpython-310.pyc +0 -0
  40. fastvideo/models/__pycache__/flash_attn_no_pad.cpython-312.pyc +0 -0
  41. fastvideo/models/flash_attn_no_pad.py +37 -0
  42. fastvideo/reward_model/clip_score.py +98 -0
  43. fastvideo/reward_model/hps_score.py +79 -0
  44. fastvideo/reward_model/image_reward.py +40 -0
  45. fastvideo/reward_model/pick_score.py +107 -0
  46. fastvideo/reward_model/unified_reward.py +333 -0
  47. fastvideo/reward_model/utils.py +126 -0
  48. fastvideo/utils/.DS_Store +0 -0
  49. fastvideo/utils/checkpoint.py +314 -0
  50. fastvideo/utils/communications.py +335 -0
fastvideo/config_sd/__pycache__/base.cpython-310.pyc ADDED
Binary file (1.26 kB). View file
 
fastvideo/config_sd/base.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ml_collections
2
+
3
+
4
def get_config():
    """Build the base RL-finetuning configuration.

    Returns an ``ml_collections.ConfigDict`` holding general run options plus
    three sub-configs: ``pretrained`` (model source), ``sample`` (inference
    settings) and ``train`` (optimizer / PPO settings).
    """
    config = ml_collections.ConfigDict()

    # --- General -------------------------------------------------------------
    # wandb run name / checkpoint tag; auto-generated from the datetime if empty.
    config.run_name = ""
    # RNG seed for reproducibility.
    config.seed = 42
    # top-level directory for checkpoint saving.
    config.logdir = "logs"
    # one epoch = one round of sampling from the model followed by training on
    # those samples.
    config.num_epochs = 300
    # epochs between checkpoint saves.
    config.save_freq = 20
    # checkpoints retained before old ones are overwritten.
    config.num_checkpoint_limit = 5
    # mixed precision: "fp16", "bf16" or "no"; half precision is much faster.
    config.mixed_precision = "bf16"
    # enable TF32 on Ampere GPUs for extra speed.
    config.allow_tf32 = True
    # checkpoint to resume from: either an exact checkpoint dir (e.g.
    # checkpoint_50) or a dir of checkpoints (the latest is used). `use_lora`
    # must match the run that produced the checkpoint.
    config.resume_from = ""
    # LoRA injects small weight matrices into the UNet attention layers and cuts
    # memory drastically (about 10GB with fp16 and batch size 1); without it,
    # training memory and checkpoint files are both large.
    config.use_lora = False

    # --- Pretrained model ----------------------------------------------------
    config.pretrained = ml_collections.ConfigDict(
        dict(
            # local path or HuggingFace hub model name.
            model="./data/StableDiffusion",
            # model revision to load.
            revision="main",
        )
    )

    # --- Sampling ------------------------------------------------------------
    config.sample = ml_collections.ConfigDict(
        dict(
            # sampler inference steps.
            num_steps=50,
            # DDIM eta: 0.0 is fully deterministic, 1.0 matches the DDPM sampler.
            eta=1.0,
            # classifier-free guidance weight; 1.0 disables guidance.
            guidance_scale=5.0,
            # per-GPU sampling batch size.
            batch_size=1,
            # batches sampled per epoch; total samples per epoch is
            # num_batches_per_epoch * batch_size * num_gpus.
            num_batches_per_epoch=2,
        )
    )

    # --- Training ------------------------------------------------------------
    train = ml_collections.ConfigDict()
    config.train = train
    # per-GPU training batch size.
    train.batch_size = 1
    # use the bitsandbytes 8-bit Adam optimizer.
    train.use_8bit_adam = False
    train.learning_rate = 1e-5
    train.adam_beta1 = 0.9
    train.adam_beta2 = 0.999
    train.adam_weight_decay = 1e-4
    train.adam_epsilon = 1e-8
    # effective batch size = batch_size * num_gpus * gradient_accumulation_steps.
    train.gradient_accumulation_steps = 1
    # gradient-clipping norm.
    train.max_grad_norm = 1.0
    # inner epochs per outer epoch (passes over one sampling round's data).
    train.num_inner_epochs = 1
    # apply classifier-free guidance during training with the sampling-time scale.
    train.cfg = True
    # advantages are clipped to [-adv_clip_max, adv_clip_max].
    train.adv_clip_max = 5
    # PPO clip range.
    train.clip_range = 1e-4
    # fraction of timesteps trained per sample; <1.0 trades policy-gradient
    # estimate accuracy for speed.
    train.timestep_fraction = 1.0

    # --- Prompting -----------------------------------------------------------
    # prompt function name (see prompts.py) and its kwargs.
    config.prompt_fn = "imagenet_animals"
    config.prompt_fn_kwargs = {}

    # --- Reward --------------------------------------------------------------
    # reward function name (see rewards.py).
    config.reward_fn = "hpsv2"

    # --- Per-prompt stat tracking (disabled) ---------------------------------
    # When enabled, reward mean/std are tracked per prompt and used to compute
    # advantages; when left unset/None, whole-batch statistics are used instead.
    # config.per_prompt_stat_tracking = ml_collections.ConfigDict()
    # rewards buffered per prompt (the buffer persists across epochs):
    # config.per_prompt_stat_tracking.buffer_size = 16
    # minimum buffered rewards before per-prompt stats replace batch stats:
    # config.per_prompt_stat_tracking.min_count = 16

    return config
fastvideo/config_sd/dgx.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import ml_collections
import importlib.util
import os

# Load the sibling base.py as a module. The legacy `imp.load_source` used here
# previously was deprecated since Python 3.4 and removed in Python 3.12;
# importlib.util is the supported replacement and yields an equivalent module
# object bound to the same name.
_base_spec = importlib.util.spec_from_file_location(
    "base", os.path.join(os.path.dirname(__file__), "base.py")
)
base = importlib.util.module_from_spec(_base_spec)
_base_spec.loader.exec_module(base)
6
+
7
+
8
def compressibility():
    """Config for JPEG-compressibility reward training on Stable Diffusion v1.4.

    Starts from the base config and overrides model, schedule, batch sizes,
    prompting and reward settings.
    """
    config = base.get_config()

    config.pretrained.model = "CompVis/stable-diffusion-v1-4"

    config.num_epochs = 300
    config.save_freq = 50
    config.num_checkpoint_limit = 100000000

    # Sampling: on the 8-GPU DGX used originally this yields
    # 8 * 8 * 4 = 256 samples per epoch.
    config.sample.batch_size = 8
    config.sample.num_batches_per_epoch = 4

    # Training: corresponds to (8 * 4) / (4 * 2) = 4 gradient updates per epoch.
    config.train.batch_size = 1
    config.train.gradient_accumulation_steps = 4

    # Prompting and reward.
    config.prompt_fn = "imagenet_animals"
    config.prompt_fn_kwargs = {}
    config.reward_fn = "jpeg_compressibility"

    # Per-prompt reward normalization settings.
    config.per_prompt_stat_tracking = dict(
        buffer_size=16,
        min_count=16,
    )

    return config
38
+
39
def hps():
    """Variant of `compressibility` tuned for a harder-to-optimize reward."""
    config = compressibility()

    config.num_epochs = 300
    # NOTE(review): despite the function name, this selects "aesthetic_score",
    # not an HPS reward — confirm this is intentional.
    config.reward_fn = "aesthetic_score"

    # This reward is a bit harder to optimize, so accumulate more gradients
    # per update.
    config.train.gradient_accumulation_steps = 8

    config.sample.batch_size = 4
    config.train.batch_size = 4

    # Prompting and generation counts.
    config.prompt_fn = "aes"
    config.chosen_number = 16
    config.num_generations = 16

    return config
57
+
58
+
59
def get_config(name):
    """Look up a config-builder function in this module by name and invoke it."""
    builder = globals()[name]
    return builder()
fastvideo/data_preprocess/.DS_Store ADDED
Binary file (6.15 kB). View file
 
fastvideo/data_preprocess/preprocess_flux_embedding.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) [2025] [FastVideo Team]
2
+ # Copyright (c) [2025] [ByteDance Ltd. and/or its affiliates.]
3
+ # SPDX-License-Identifier: [Apache License 2.0]
4
+ #
5
+ # This file has been modified by [ByteDance Ltd. and/or its affiliates.] in 2025.
6
+ #
7
+ # Original file was released under [Apache License 2.0], with the full license text
8
+ # available at [https://github.com/hao-ai-lab/FastVideo/blob/main/LICENSE].
9
+ #
10
+ # This modified file is released under the same license.
11
+
12
+
13
+ import argparse
14
+ import torch
15
+ from accelerate.logging import get_logger
16
+ import cv2
17
+ import json
18
+ import os
19
+ import torch.distributed as dist
20
+ from pathlib import Path
21
+
22
+ logger = get_logger(__name__)
23
+ from torch.utils.data import Dataset
24
+ from torch.utils.data.distributed import DistributedSampler
25
+ from torch.utils.data import DataLoader
26
+ from tqdm import tqdm
27
+ import re
28
+ from diffusers import FluxPipeline
29
+
30
def contains_chinese(text):
    """Return True if *text* contains any CJK unified ideograph (U+4E00–U+9FFF)."""
    return any("\u4e00" <= ch <= "\u9fff" for ch in text)
32
+
33
class T5dataset(Dataset):
    """Caption dataset read from a plain-text file, one caption per line.

    Lines containing Chinese characters are filtered out and at most the
    first 50,000 remaining lines are kept.
    """

    def __init__(
        self, txt_path, vae_debug,
    ):
        # txt_path: path to the caption .txt file.
        # vae_debug: when truthy, __getitem__ also loads a precomputed latent.
        self.txt_path = txt_path
        self.vae_debug = vae_debug
        with open(self.txt_path, "r", encoding="utf-8") as f:
            self.train_dataset = [
                line for line in f.read().splitlines() if not contains_chinese(line)
            ][:50000]

    def __getitem__(self, idx):
        """Return dict(caption=str, latents=tensor-or-[], filename=str(idx))."""
        #import pdb;pdb.set_trace()
        caption = self.train_dataset[idx]
        filename = str(idx)
        #length = self.train_dataset[idx]["length"]
        if self.vae_debug:
            # NOTE(review): this branch depends on a module-global `args` and
            # indexes a caption *string* with ["latent_path"], which would raise
            # TypeError. It looks stale from a JSON-based dataset variant —
            # confirm before running with vae_debug=True.
            latents = torch.load(
                os.path.join(
                    args.output_dir, "latent", self.train_dataset[idx]["latent_path"]
                ),
                map_location="cpu",
            )
        else:
            latents = []

        return dict(caption=caption, latents=latents, filename=filename)

    def __len__(self):
        return len(self.train_dataset)
63
+
64
+
65
def main(args):
    """Encode captions with the Flux text encoders and save, per sample,
    the prompt embedding, pooled prompt embedding and text ids as .pt files,
    plus a videos2caption.json manifest (written by rank 0).

    Runs under torch.distributed: each rank processes a shard of the dataset
    (via DistributedSampler) and the per-rank manifests are gathered at the end.
    """
    # NOTE(review): RANK is the *global* rank; torch.cuda.set_device should use
    # LOCAL_RANK on multi-node setups — confirm this only runs single-node.
    local_rank = int(os.getenv("RANK", 0))
    world_size = int(os.getenv("WORLD_SIZE", 1))
    print("world_size", world_size, "local rank", local_rank)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.cuda.set_device(local_rank)
    if not dist.is_initialized():
        dist.init_process_group(
            backend="nccl", init_method="env://", world_size=world_size, rank=local_rank
        )

    # One output subdirectory per saved tensor kind.
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "prompt_embed"), exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "text_ids"), exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "pooled_prompt_embeds"), exist_ok=True)

    latents_txt_path = args.prompt_dir
    train_dataset = T5dataset(latents_txt_path, args.vae_debug)
    sampler = DistributedSampler(
        train_dataset, rank=local_rank, num_replicas=world_size, shuffle=True
    )
    train_dataloader = DataLoader(
        train_dataset,
        sampler=sampler,
        batch_size=args.train_batch_size,
        num_workers=args.dataloader_num_workers,
    )
    flux_path = args.model_path
    pipe = FluxPipeline.from_pretrained(flux_path, torch_dtype=torch.bfloat16).to(device)

    json_data = []
    for _, data in tqdm(enumerate(train_dataloader), disable=local_rank != 0):
        try:
            with torch.inference_mode():
                if args.vae_debug:
                    latents = data["latents"]
                # Encode the whole batch once. (The original called
                # encode_prompt inside the per-sample loop, re-encoding the
                # identical batch for every sample in it.)
                prompt_embeds, pooled_prompt_embeds, text_ids = pipe.encode_prompt(
                    prompt=data["caption"], prompt_2=data["caption"]
                )
                for idx, video_name in enumerate(data["filename"]):
                    prompt_embed_path = os.path.join(
                        args.output_dir, "prompt_embed", video_name + ".pt"
                    )
                    pooled_prompt_embeds_path = os.path.join(
                        args.output_dir, "pooled_prompt_embeds", video_name + ".pt"
                    )
                    text_ids_path = os.path.join(
                        args.output_dir, "text_ids", video_name + ".pt"
                    )
                    # Save the per-sample slices.
                    torch.save(prompt_embeds[idx], prompt_embed_path)
                    torch.save(pooled_prompt_embeds[idx], pooled_prompt_embeds_path)
                    torch.save(text_ids[idx], text_ids_path)
                    item = {}
                    item["prompt_embed_path"] = video_name + ".pt"
                    item["text_ids"] = video_name + ".pt"
                    item["pooled_prompt_embeds_path"] = video_name + ".pt"
                    item["caption"] = data["caption"][idx]
                    json_data.append(item)
        except Exception as e:
            # Surface the failure on this rank, sync so every rank sees the
            # stop point, then re-raise to abort the job.
            print(f"Rank {local_rank} Error: {repr(e)}")
            dist.barrier()
            raise
    dist.barrier()
    # Gather every rank's manifest entries; rank 0 writes the combined JSON.
    local_data = json_data
    gathered_data = [None] * world_size
    dist.all_gather_object(gathered_data, local_data)
    if local_rank == 0:
        # os.remove(latents_json_path)
        all_json_data = [item for sublist in gathered_data for item in sublist]
        with open(os.path.join(args.output_dir, "videos2caption.json"), "w") as f:
            json.dump(all_json_data, f, indent=4)
139
+
140
+
141
if __name__ == "__main__":
    # CLI for the embedding-preprocessing script; parses args and runs main().
    parser = argparse.ArgumentParser()
    # dataset & dataloader
    # NOTE(review): the "mochi" defaults look inherited from another script —
    # this file loads a FluxPipeline; confirm callers always pass --model_path.
    parser.add_argument("--model_path", type=str, default="data/mochi")
    parser.add_argument("--model_type", type=str, default="mochi")
    # text encoder & vae & diffusion model
    parser.add_argument(
        "--dataloader_num_workers",
        type=int,
        default=1,
        help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.",
    )
    parser.add_argument(
        "--train_batch_size",
        type=int,
        default=1,
        help="Batch size (per device) for the training dataloader.",
    )
    # NOTE(review): --text_encoder_name and --cache_dir are never read in this
    # script's main(); they appear vestigial.
    parser.add_argument("--text_encoder_name", type=str, default="google/t5-v1_1-xxl")
    parser.add_argument("--cache_dir", type=str, default="./cache_dir")
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument("--vae_debug", action="store_true")
    # Path to the caption .txt file (one caption per line).
    parser.add_argument("--prompt_dir", type=str, default="./empty.txt")
    args = parser.parse_args()
    main(args)
fastvideo/data_preprocess/preprocess_flux_embedding_rlpt.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) [2025] [FastVideo Team]
2
+ # Copyright (c) [2025] [ByteDance Ltd. and/or its affiliates.]
3
+ # SPDX-License-Identifier: [Apache License 2.0]
4
+ #
5
+ # This file has been modified by [ByteDance Ltd. and/or its affiliates.] in 2025.
6
+ #
7
+ # Original file was released under [Apache License 2.0], with the full license text
8
+ # available at [https://github.com/hao-ai-lab/FastVideo/blob/main/LICENSE].
9
+ #
10
+ # This modified file is released under the same license.
11
+
12
+
13
+ import argparse
14
+ import torch
15
+ from accelerate.logging import get_logger
16
+ import cv2
17
+ import json
18
+ import os
19
+ import torch.distributed as dist
20
+ from pathlib import Path
21
+
22
+ logger = get_logger(__name__)
23
+ from torch.utils.data import Dataset
24
+ from torch.utils.data.distributed import DistributedSampler
25
+ from torch.utils.data import DataLoader
26
+ from tqdm import tqdm
27
+ import re
28
+ from diffusers import FluxPipeline
29
+
30
def contains_chinese(text):
    """Return True when *text* contains at least one CJK unified ideograph."""
    match = re.search(r'[\u4e00-\u9fff]', text)
    return match is not None
32
+
33
class T5dataset(Dataset):
    """Caption dataset from a plain-text file, one caption per line.

    Empty lines and lines containing Chinese characters are dropped; at most
    the first 50,000 remaining (stripped) captions are kept.
    """

    def __init__(
        self, txt_path, vae_debug,
    ):
        # txt_path: path to the caption .txt file.
        # vae_debug: when truthy, __getitem__ also loads a precomputed latent.
        self.txt_path = txt_path
        self.vae_debug = vae_debug
        print(f"[DEBUG] Loading captions from: {self.txt_path}")
        with open(self.txt_path, "r", encoding="utf-8") as f:
            self.train_dataset = [
                line.strip() for line in f.read().splitlines() if line.strip() and not contains_chinese(line)
            ][:50000]
        print(f"[DEBUG] Loaded {len(self.train_dataset)} captions after filtering")

    def __getitem__(self, idx):
        """Return dict(caption=str, latents=tensor-or-[], filename=str(idx))."""
        #import pdb;pdb.set_trace()
        caption = self.train_dataset[idx]
        filename = str(idx)
        #length = self.train_dataset[idx]["length"]
        if self.vae_debug:
            # NOTE(review): this branch depends on a module-global `args` and
            # indexes a caption *string* with ["latent_path"], which would raise
            # TypeError. It looks stale from a JSON-based dataset variant —
            # confirm before running with vae_debug=True.
            latents = torch.load(
                os.path.join(
                    args.output_dir, "latent", self.train_dataset[idx]["latent_path"]
                ),
                map_location="cpu",
            )
        else:
            latents = []

        return dict(caption=caption, latents=latents, filename=filename)

    def __len__(self):
        return len(self.train_dataset)
65
+
66
+
67
def main(args):
    """Encode captions with the Flux text encoders and save, per sample,
    the prompt embedding, pooled prompt embedding and text ids as .pt files,
    plus a videos2caption.json manifest (written by rank 0).

    Runs under torch.distributed: each rank processes a shard of the dataset
    (via DistributedSampler) and the per-rank manifests are gathered at the end.
    """
    # NOTE(review): RANK is the *global* rank; torch.cuda.set_device should use
    # LOCAL_RANK on multi-node setups — confirm this only runs single-node.
    local_rank = int(os.getenv("RANK", 0))
    world_size = int(os.getenv("WORLD_SIZE", 1))
    print("world_size", world_size, "local rank", local_rank)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.cuda.set_device(local_rank)
    if not dist.is_initialized():
        dist.init_process_group(
            backend="nccl", init_method="env://", world_size=world_size, rank=local_rank
        )

    # One output subdirectory per saved tensor kind.
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "prompt_embed"), exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "text_ids"), exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "pooled_prompt_embeds"), exist_ok=True)

    latents_txt_path = args.prompt_dir
    train_dataset = T5dataset(latents_txt_path, args.vae_debug)
    sampler = DistributedSampler(
        train_dataset, rank=local_rank, num_replicas=world_size, shuffle=True
    )
    train_dataloader = DataLoader(
        train_dataset,
        sampler=sampler,
        batch_size=args.train_batch_size,
        num_workers=args.dataloader_num_workers,
    )
    flux_path = args.model_path
    pipe = FluxPipeline.from_pretrained(flux_path, torch_dtype=torch.bfloat16).to(device)

    json_data = []
    for _, data in tqdm(enumerate(train_dataloader), disable=local_rank != 0):
        try:
            with torch.inference_mode():
                if args.vae_debug:
                    latents = data["latents"]
                # Encode the whole batch once. (The original called
                # encode_prompt inside the per-sample loop, re-encoding the
                # identical batch for every sample in it.)
                prompt_embeds, pooled_prompt_embeds, text_ids = pipe.encode_prompt(
                    prompt=data["caption"], prompt_2=data["caption"]
                )
                for idx, video_name in enumerate(data["filename"]):
                    prompt_embed_path = os.path.join(
                        args.output_dir, "prompt_embed", video_name + ".pt"
                    )
                    pooled_prompt_embeds_path = os.path.join(
                        args.output_dir, "pooled_prompt_embeds", video_name + ".pt"
                    )
                    text_ids_path = os.path.join(
                        args.output_dir, "text_ids", video_name + ".pt"
                    )
                    # Save the per-sample slices.
                    torch.save(prompt_embeds[idx], prompt_embed_path)
                    torch.save(pooled_prompt_embeds[idx], pooled_prompt_embeds_path)
                    torch.save(text_ids[idx], text_ids_path)
                    item = {}
                    item["prompt_embed_path"] = video_name + ".pt"
                    item["text_ids"] = video_name + ".pt"
                    item["pooled_prompt_embeds_path"] = video_name + ".pt"
                    item["caption"] = data["caption"][idx]
                    json_data.append(item)
        except Exception as e:
            # Surface the failure on this rank, sync so every rank sees the
            # stop point, then re-raise to abort the job.
            print(f"Rank {local_rank} Error: {repr(e)}")
            dist.barrier()
            raise
    dist.barrier()
    # Gather every rank's manifest entries; rank 0 writes the combined JSON.
    local_data = json_data
    gathered_data = [None] * world_size
    dist.all_gather_object(gathered_data, local_data)
    if local_rank == 0:
        # os.remove(latents_json_path)
        all_json_data = [item for sublist in gathered_data for item in sublist]
        with open(os.path.join(args.output_dir, "videos2caption.json"), "w") as f:
            json.dump(all_json_data, f, indent=4)
141
+
142
+
143
if __name__ == "__main__":
    # CLI for the embedding-preprocessing script; parses args and runs main().
    parser = argparse.ArgumentParser()
    # dataset & dataloader
    # NOTE(review): the "mochi" defaults look inherited from another script —
    # this file loads a FluxPipeline; confirm callers always pass --model_path.
    parser.add_argument("--model_path", type=str, default="data/mochi")
    parser.add_argument("--model_type", type=str, default="mochi")
    # text encoder & vae & diffusion model
    parser.add_argument(
        "--dataloader_num_workers",
        type=int,
        default=1,
        help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.",
    )
    parser.add_argument(
        "--train_batch_size",
        type=int,
        default=1,
        help="Batch size (per device) for the training dataloader.",
    )
    # NOTE(review): --text_encoder_name and --cache_dir are never read in this
    # script's main(); they appear vestigial.
    parser.add_argument("--text_encoder_name", type=str, default="google/t5-v1_1-xxl")
    parser.add_argument("--cache_dir", type=str, default="./cache_dir")
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument("--vae_debug", action="store_true")
    # Path to the caption .txt file (one caption per line).
    parser.add_argument("--prompt_dir", type=str, default="./empty.txt")
    args = parser.parse_args()
    main(args)
fastvideo/data_preprocess/preprocess_flux_rfpt_embedding.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) [2025] [FastVideo Team]
2
+ # Copyright (c) [2025] [ByteDance Ltd. and/or its affiliates.]
3
+ # SPDX-License-Identifier: [Apache License 2.0]
4
+ #
5
+ # This file has been modified by [ByteDance Ltd. and/or its affiliates.] in 2025.
6
+ #
7
+ # Original file was released under [Apache License 2.0], with the full license text
8
+ # available at [https://github.com/hao-ai-lab/FastVideo/blob/main/LICENSE].
9
+ #
10
+ # This modified file is released under the same license.
11
+
12
+
13
+ import argparse
14
+ import torch
15
+ from accelerate.logging import get_logger
16
+ import cv2
17
+ import json
18
+ import os
19
+ import torch.distributed as dist
20
+ import pandas as pd
21
+ from torch.utils.data.dataset import ConcatDataset, Dataset
22
+ import io
23
+ import torchvision.transforms as transforms
24
+ logger = get_logger(__name__)
25
+ from torch.utils.data import Dataset
26
+ from torch.utils.data.distributed import DistributedSampler
27
+ from torch.utils.data import DataLoader
28
+ from tqdm import tqdm
29
+ import re
30
+ from diffusers import FluxPipeline
31
+ from PIL import Image
32
+ from diffusers.image_processor import VaeImageProcessor
33
+
34
def contains_chinese(text):
    """True iff *text* contains a character in the CJK range U+4E00–U+9FFF."""
    return re.search(r'[\u4e00-\u9fff]', text) is not None
36
+
37
class RFPTdataset(Dataset):
    """Image/caption dataset read from a directory of parquet shards.

    Each parquet row is expected to hold an ``image`` column (dict with raw
    ``bytes``) and a ``caption_composition`` string. Rows with a missing image
    or caption are skipped by advancing to the next row (wrapping at the end).
    """

    def __init__(
        self, file_path,
    ):
        # file_path: directory containing the parquet shards.
        self.file_path = file_path
        file_names = os.listdir(self.file_path)  # each file contains 5,000 images
        self.file_names = [os.path.join(self.file_path, file_name) for file_name in file_names]
        self.train_dataset = self.read_data()
        self.transform = transforms.ToTensor()

    def read_data(self):
        """Concatenate every parquet shard into one DataFrame."""
        df_list = [pd.read_parquet(file_name) for file_name in self.file_names]
        combined_df = pd.concat(df_list, axis=0, ignore_index=True)
        return combined_df

    def __len__(self):
        return len(self.train_dataset)

    def __getitem__(self, index):
        row = self.train_dataset.iloc[index]
        image_bytes = row['image']['bytes']
        caption = row['caption_composition']
        # Validate *before* decoding and with `is None`:
        # the original compared with `== None` (ambiguous on array-like values)
        # and only after running the transform, so a None image crashed inside
        # PIL before the check could fire. Wrapping with modulo also fixes the
        # IndexError the original raised when the *last* row was invalid.
        if caption is None or image_bytes is None:
            return self.__getitem__((index + 1) % len(self))
        image = self.transform(Image.open(io.BytesIO(image_bytes)).convert('RGB'))
        filename = str(index)
        return dict(caption=caption, image=image, filename=filename)
67
+
68
class T5dataset(Dataset):
    """Caption dataset read from a plain-text file, one caption per line.

    Lines containing Chinese characters are filtered out and at most the
    first 50,000 remaining lines are kept. (Unused by main() in this script,
    which builds an RFPTdataset instead.)
    """

    def __init__(
        self, txt_path, vae_debug,
    ):
        # txt_path: path to the caption .txt file.
        # vae_debug: when truthy, __getitem__ also loads a precomputed latent.
        self.txt_path = txt_path
        self.vae_debug = vae_debug
        with open(self.txt_path, "r", encoding="utf-8") as f:
            self.train_dataset = [
                line for line in f.read().splitlines() if not contains_chinese(line)
            ][:50000]

    def __getitem__(self, idx):
        """Return dict(caption=str, latents=tensor-or-[], filename=str(idx))."""
        #import pdb;pdb.set_trace()
        caption = self.train_dataset[idx]
        filename = str(idx)
        #length = self.train_dataset[idx]["length"]
        if self.vae_debug:
            # NOTE(review): this branch depends on a module-global `args` and
            # indexes a caption *string* with ["latent_path"], which would raise
            # TypeError. It looks stale from a JSON-based dataset variant —
            # confirm before running with vae_debug=True.
            latents = torch.load(
                os.path.join(
                    args.output_dir, "latent", self.train_dataset[idx]["latent_path"]
                ),
                map_location="cpu",
            )
        else:
            latents = []

        return dict(caption=caption, latents=latents, filename=filename)

    def __len__(self):
        return len(self.train_dataset)
98
+
99
+
100
def main(args):
    """Save raw image tensors (bf16) from an RFPTdataset to .pt files and
    write a videos2caption.json manifest (rank 0 only).

    Runs under torch.distributed; each rank processes its DistributedSampler
    shard. Text-embedding and VAE-encoding steps are present but commented
    out, so only images are written.
    """
    # NOTE(review): RANK is the *global* rank; torch.cuda.set_device should use
    # LOCAL_RANK on multi-node setups — confirm this only runs single-node.
    local_rank = int(os.getenv("RANK", 0))
    world_size = int(os.getenv("WORLD_SIZE", 1))
    print("world_size", world_size, "local rank", local_rank)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.cuda.set_device(local_rank)
    if not dist.is_initialized():
        dist.init_process_group(
            backend="nccl", init_method="env://", world_size=world_size, rank=local_rank
        )

    # Output subdirectories; only "images" actually receives files below.
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "prompt_embed"), exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "text_ids"), exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "pooled_prompt_embeds"), exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "images"), exist_ok=True)

    # latents_txt_path = args.prompt_dir
    # train_dataset = T5dataset(latents_txt_path, args.vae_debug)

    train_dataset = RFPTdataset(args.prompt_dir)


    sampler = DistributedSampler(
        train_dataset, rank=local_rank, num_replicas=world_size, shuffle=True
    )

    train_dataloader = DataLoader(
        train_dataset,
        sampler=sampler,
        batch_size=args.train_batch_size,
        num_workers=args.dataloader_num_workers,
    )
    flux_path = args.model_path
    # NOTE(review): the pipeline and image_processor are loaded but unused
    # while the encoding code below stays commented out.
    pipe = FluxPipeline.from_pretrained(flux_path, torch_dtype=torch.bfloat16).to(device)
    image_processor = VaeImageProcessor(16)

    json_data = []
    for _, data in tqdm(enumerate(train_dataloader), disable=local_rank != 0):
        try:
            with torch.inference_mode():
                if args.vae_debug:
                    latents = data["latents"]
                for idx, video_name in enumerate(data["filename"]):
                    # prompt_embeds, pooled_prompt_embeds, text_ids = pipe.encode_prompt(
                    #     prompt=data["caption"], prompt_2=data["caption"]
                    # )
                    # image_latents = pipe.vae.encode(data["image"].to(torch.bfloat16).to(device)).latent_dist.sample()
                    # output_image = pipe.vae.decode(image_latents, return_dict=False)[0]
                    # output_image = image_processor.postprocess(output_image)
                    # output_image[0].save('output.png')
                    # print(image_latents.latent_dist.sample())
                    # print(image_latents.latent_dist.sample().shape)

                    prompt_embed_path = os.path.join(
                        args.output_dir, "prompt_embed", video_name + ".pt"
                    )
                    pooled_prompt_embeds_path = os.path.join(
                        args.output_dir, "pooled_prompt_embeds", video_name + ".pt"
                    )

                    text_ids_path = os.path.join(
                        args.output_dir, "text_ids", video_name + ".pt"
                    )

                    image_latents_path = os.path.join(
                        args.output_dir, "images", video_name + ".pt"
                    )
                    # save latent
                    # torch.save(prompt_embeds[idx], prompt_embed_path)
                    # torch.save(pooled_prompt_embeds[idx], pooled_prompt_embeds_path)
                    # torch.save(text_ids[idx], text_ids_path)
                    # NOTE(review): this saves the *whole batch* tensor
                    # data["image"] (not data["image"][idx]) under each
                    # per-sample filename — correct only for batch size 1;
                    # confirm intent.
                    torch.save(data["image"].to(torch.bfloat16), image_latents_path)
                    item = {}
                    # NOTE(review): the manifest references prompt_embed /
                    # text_ids / pooled files that are never written while the
                    # encoding code above is commented out.
                    item["prompt_embed_path"] = video_name + ".pt"
                    item["text_ids"] = video_name + ".pt"
                    item["pooled_prompt_embeds_path"] = video_name + ".pt"
                    item["caption"] = data["caption"][idx]
                    json_data.append(item)
        except Exception as e:
            # Surface the failure, sync ranks, then abort the job.
            print(f"Rank {local_rank} Error: {repr(e)}")
            dist.barrier()
            raise
    dist.barrier()
    # Gather every rank's manifest entries; rank 0 writes the combined JSON.
    local_data = json_data
    gathered_data = [None] * world_size
    dist.all_gather_object(gathered_data, local_data)
    if local_rank == 0:
        # os.remove(latents_json_path)
        all_json_data = [item for sublist in gathered_data for item in sublist]
        with open(os.path.join(args.output_dir, "videos2caption.json"), "w") as f:
            json.dump(all_json_data, f, indent=4)
193
+
194
+
195
if __name__ == "__main__":
    # CLI for the RFPT image-preprocessing script; parses args and runs main().
    parser = argparse.ArgumentParser()
    # dataset & dataloader
    # NOTE(review): the "mochi" defaults look inherited from another script —
    # this file loads a FluxPipeline; confirm callers always pass --model_path.
    parser.add_argument("--model_path", type=str, default="data/mochi")
    parser.add_argument("--model_type", type=str, default="mochi")
    # text encoder & vae & diffusion model
    parser.add_argument(
        "--dataloader_num_workers",
        type=int,
        default=1,
        help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.",
    )
    parser.add_argument(
        "--train_batch_size",
        type=int,
        default=1,
        help="Batch size (per device) for the training dataloader.",
    )
    # NOTE(review): --text_encoder_name and --cache_dir are never read in this
    # script's main(); they appear vestigial.
    parser.add_argument("--text_encoder_name", type=str, default="google/t5-v1_1-xxl")
    parser.add_argument("--cache_dir", type=str, default="./cache_dir")
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument("--vae_debug", action="store_true")
    # For this script, --prompt_dir is the directory of parquet shards.
    parser.add_argument("--prompt_dir", type=str, default="./empty.txt")
    args = parser.parse_args()
    main(args)
fastvideo/data_preprocess/preprocess_qwenimage_embedding.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) [2025] [FastVideo Team]
2
+ # Copyright (c) [2025] [ByteDance Ltd. and/or its affiliates.]
3
+ # SPDX-License-Identifier: [Apache License 2.0]
4
+ #
5
+ # This file has been modified by [ByteDance Ltd. and/or its affiliates.] in 2025.
6
+ #
7
+ # Original file was released under [Apache License 2.0], with the full license text
8
+ # available at [https://github.com/hao-ai-lab/FastVideo/blob/main/LICENSE].
9
+ #
10
+ # This modified file is released under the same license.
11
+
12
+ import argparse
13
+ import torch
14
+ from accelerate.logging import get_logger
15
+ # from fastvideo.models.mochi_hf.pipeline_mochi import MochiPipeline
16
+ from diffusers.utils import export_to_video
17
+ from fastvideo.models.qwenimage.pipeline_qwenimage import QwenImagePipeline
18
+ import json
19
+ import os
20
+ import torch.distributed as dist
21
+
22
+ logger = get_logger(__name__)
23
+ from torch.utils.data import Dataset
24
+ from torch.utils.data.distributed import DistributedSampler
25
+ from torch.utils.data import DataLoader
26
+ from fastvideo.utils.load import load_text_encoder, load_vae
27
+ from diffusers.video_processor import VideoProcessor
28
+ from tqdm import tqdm
29
+ import re
30
+ from diffusers import DiffusionPipeline
31
+ import torch.nn.functional as F
32
+
33
+ def contains_chinese(text):
34
+ """检查字符串是否包含中文字符"""
35
+ return bool(re.search(r'[\u4e00-\u9fff]', text))
36
+
37
+ class T5dataset(Dataset):
38
+ def __init__(
39
+ self, txt_path, vae_debug,
40
+ ):
41
+ self.txt_path = txt_path
42
+ self.vae_debug = vae_debug
43
+ with open(self.txt_path, "r", encoding="utf-8") as f:
44
+ self.train_dataset = [
45
+ line for line in f.read().splitlines() if not contains_chinese(line)
46
+ ]
47
+ #self.train_dataset = sorted(train_dataset)
48
+
49
+ def __getitem__(self, idx):
50
+ #import pdb;pdb.set_trace()
51
+ caption = self.train_dataset[idx]
52
+ filename = str(idx)
53
+ #length = self.train_dataset[idx]["length"]
54
+ if self.vae_debug:
55
+ latents = torch.load(
56
+ os.path.join(
57
+ args.output_dir, "latent", self.train_dataset[idx]["latent_path"]
58
+ ),
59
+ map_location="cpu",
60
+ )
61
+ else:
62
+ latents = []
63
+
64
+ return dict(caption=caption, latents=latents, filename=filename)
65
+
66
+ def __len__(self):
67
+ return len(self.train_dataset)
68
+
69
+
70
+ def main(args):
71
+ local_rank = int(os.getenv("RANK", 0))
72
+ world_size = int(os.getenv("WORLD_SIZE", 1))
73
+ print("world_size", world_size, "local rank", local_rank)
74
+
75
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
76
+ torch.cuda.set_device(local_rank)
77
+ if not dist.is_initialized():
78
+ dist.init_process_group(
79
+ backend="nccl", init_method="env://", world_size=world_size, rank=local_rank
80
+ )
81
+
82
+ #videoprocessor = VideoProcessor(vae_scale_factor=8)
83
+ os.makedirs(args.output_dir, exist_ok=True)
84
+ os.makedirs(os.path.join(args.output_dir, "prompt_embed"), exist_ok=True)
85
+ os.makedirs(os.path.join(args.output_dir, "prompt_attention_mask"), exist_ok=True)
86
+
87
+ latents_txt_path = args.prompt_dir
88
+ train_dataset = T5dataset(latents_txt_path, args.vae_debug)
89
+ #text_encoder = load_text_encoder(args.model_type, args.model_path, device=device)
90
+ #vae, autocast_type, fps = load_vae(args.model_type, args.model_path)
91
+ #vae.enable_tiling()
92
+ sampler = DistributedSampler(
93
+ train_dataset, rank=local_rank, num_replicas=world_size, shuffle=False
94
+ )
95
+ train_dataloader = DataLoader(
96
+ train_dataset,
97
+ sampler=sampler,
98
+ batch_size=args.train_batch_size,
99
+ num_workers=args.dataloader_num_workers,
100
+ )
101
+ # Load pipeline but don't move everything to GPU yet
102
+ pipe = QwenImagePipeline.from_pretrained(args.model_path, torch_dtype=torch.bfloat16)
103
+
104
+ # Only move text_encoder to GPU for embedding generation
105
+ pipe.text_encoder = pipe.text_encoder.to(device)
106
+
107
+ # Delete unused components to free up RAM/VRAM
108
+ if not args.vae_debug:
109
+ # Remove from attributes
110
+ if hasattr(pipe, "transformer"):
111
+ del pipe.transformer
112
+ if hasattr(pipe, "vae"):
113
+ del pipe.vae
114
+
115
+ # Remove from components dictionary to ensure garbage collection
116
+ if "transformer" in pipe.components:
117
+ del pipe.components["transformer"]
118
+ if "vae" in pipe.components:
119
+ del pipe.components["vae"]
120
+
121
+ import gc
122
+ gc.collect()
123
+ torch.cuda.empty_cache()
124
+
125
+ # pipe._execution_device = device # This causes AttributeError, removing it.
126
+
127
+ json_data = []
128
+ for _, data in tqdm(enumerate(train_dataloader), disable=local_rank != 0):
129
+ with torch.inference_mode():
130
+ with torch.autocast("cuda"):
131
+ prompt_embeds, prompt_attention_mask = pipe.encode_prompt(
132
+ prompt=data["caption"],
133
+ device=device # Explicitly pass device
134
+ )
135
+
136
+ # ==================== 代码修改开始 ====================
137
+
138
+ # 1. 记录原始的序列长度 (第二个维度的大小)
139
+ original_length = prompt_embeds.shape[1]
140
+ target_length = 1024
141
+
142
+ # 2. 计算需要填充的长度
143
+ # 假设 original_length 不会超过 target_length
144
+ pad_len = target_length - original_length
145
+
146
+ # 3. 填充 prompt_embeds
147
+ # prompt_embeds 是一个3D张量 (B, L, D),我们需要填充第二个维度 L
148
+ # F.pad 的填充参数顺序是从最后一个维度开始的 (pad_dim_D_left, pad_dim_D_right, pad_dim_L_left, pad_dim_L_right, ...)
149
+ # 我们在维度1(序列长度L)的右侧进行填充
150
+ prompt_embeds = F.pad(prompt_embeds, (0, 0, 0, pad_len), "constant", 0)
151
+
152
+ # 4. 填充 prompt_attention_mask
153
+ # prompt_attention_mask 是一个2D张量 (B, L),我们同样填充第二个维度 L
154
+ # 我们在维度1(序列长度L)的右侧进行填充
155
+ prompt_attention_mask = F.pad(prompt_attention_mask, (0, pad_len), "constant", 0)
156
+
157
+ # ==================== 代码修改结束 ====================
158
+
159
+ if args.vae_debug:
160
+ latents = data["latents"]
161
+ for idx, video_name in enumerate(data["filename"]):
162
+ prompt_embed_path = os.path.join(
163
+ args.output_dir, "prompt_embed", video_name + ".pt"
164
+ )
165
+ prompt_attention_mask_path = os.path.join(
166
+ args.output_dir, "prompt_attention_mask", video_name + ".pt"
167
+ )
168
+ # 保存 latent (注意这里保存的是填充后的张量)
169
+ torch.save(prompt_embeds[idx], prompt_embed_path)
170
+ torch.save(prompt_attention_mask[idx], prompt_attention_mask_path)
171
+ item = {}
172
+ item["prompt_embed_path"] = video_name + ".pt"
173
+ item["prompt_attention_mask"] = video_name + ".pt"
174
+ item["caption"] = data["caption"][idx]
175
+
176
+ # [新增] 将原始长度记录到 item 字典中
177
+ item["original_length"] = original_length
178
+
179
+ json_data.append(item)
180
+ dist.barrier()
181
+ local_data = json_data
182
+ gathered_data = [None] * world_size
183
+ dist.all_gather_object(gathered_data, local_data)
184
+ if local_rank == 0:
185
+ # os.remove(latents_json_path)
186
+ all_json_data = [item for sublist in gathered_data for item in sublist]
187
+ with open(os.path.join(args.output_dir, "videos2caption.json"), "w") as f:
188
+ json.dump(all_json_data, f, indent=4)
189
+
190
+
191
+ if __name__ == "__main__":
192
+ parser = argparse.ArgumentParser()
193
+ # dataset & dataloader
194
+ parser.add_argument("--model_path", type=str, default="data/mochi")
195
+ parser.add_argument("--model_type", type=str, default="mochi")
196
+ # text encoder & vae & diffusion model
197
+ parser.add_argument(
198
+ "--dataloader_num_workers",
199
+ type=int,
200
+ default=1,
201
+ help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.",
202
+ )
203
+ parser.add_argument(
204
+ "--train_batch_size",
205
+ type=int,
206
+ default=1,
207
+ help="Batch size (per device) for the training dataloader.",
208
+ )
209
+ parser.add_argument("--text_encoder_name", type=str, default="google/t5-v1_1-xxl")
210
+ parser.add_argument("--cache_dir", type=str, default="./cache_dir")
211
+ parser.add_argument(
212
+ "--output_dir",
213
+ type=str,
214
+ default=None,
215
+ help="The output directory where the model predictions and checkpoints will be written.",
216
+ )
217
+ parser.add_argument("--vae_debug", action="store_true")
218
+ parser.add_argument("--prompt_dir", type=str, default="./empty.txt")
219
+ args = parser.parse_args()
220
+ main(args)
fastvideo/data_preprocess/preprocess_rl_embeddings.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) [2025] [FastVideo Team]
2
+ # Copyright (c) [2025] [ByteDance Ltd. and/or its affiliates.]
3
+ # SPDX-License-Identifier: [Apache License 2.0]
4
+ #
5
+ # This file has been modified by [ByteDance Ltd. and/or its affiliates.] in 2025.
6
+ #
7
+ # Original file was released under [Apache License 2.0], with the full license text
8
+ # available at [https://github.com/hao-ai-lab/FastVideo/blob/main/LICENSE].
9
+ #
10
+ # This modified file is released under the same license.
11
+
12
+ import argparse
13
+ import torch
14
+ from accelerate.logging import get_logger
15
+ from fastvideo.models.mochi_hf.pipeline_mochi import MochiPipeline
16
+ from diffusers.utils import export_to_video
17
+ import json
18
+ import os
19
+ import torch.distributed as dist
20
+
21
+ logger = get_logger(__name__)
22
+ from torch.utils.data import Dataset
23
+ from torch.utils.data.distributed import DistributedSampler
24
+ from torch.utils.data import DataLoader
25
+ from fastvideo.utils.load import load_text_encoder, load_vae
26
+ from diffusers.video_processor import VideoProcessor
27
+ from tqdm import tqdm
28
+ import re
29
+
30
+ def contains_chinese(text):
31
+ """检查字符串是否包含中文字符"""
32
+ return bool(re.search(r'[\u4e00-\u9fff]', text))
33
+
34
+ class T5dataset(Dataset):
35
+ def __init__(
36
+ self, txt_path, vae_debug,
37
+ ):
38
+ self.txt_path = txt_path
39
+ self.vae_debug = vae_debug
40
+ with open(self.txt_path, "r", encoding="utf-8") as f:
41
+ self.train_dataset = [
42
+ line for line in f.read().splitlines() if not contains_chinese(line)
43
+ ]
44
+ #self.train_dataset = sorted(train_dataset)
45
+
46
+ def __getitem__(self, idx):
47
+ #import pdb;pdb.set_trace()
48
+ caption = self.train_dataset[idx]
49
+ filename = str(idx)
50
+ #length = self.train_dataset[idx]["length"]
51
+ if self.vae_debug:
52
+ latents = torch.load(
53
+ os.path.join(
54
+ args.output_dir, "latent", self.train_dataset[idx]["latent_path"]
55
+ ),
56
+ map_location="cpu",
57
+ )
58
+ else:
59
+ latents = []
60
+
61
+ return dict(caption=caption, latents=latents, filename=filename)
62
+
63
+ def __len__(self):
64
+ return len(self.train_dataset)
65
+
66
+
67
+ def main(args):
68
+ local_rank = int(os.getenv("RANK", 0))
69
+ world_size = int(os.getenv("WORLD_SIZE", 1))
70
+ print("world_size", world_size, "local rank", local_rank)
71
+
72
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
73
+ torch.cuda.set_device(local_rank)
74
+ if not dist.is_initialized():
75
+ dist.init_process_group(
76
+ backend="nccl", init_method="env://", world_size=world_size, rank=local_rank
77
+ )
78
+
79
+ #videoprocessor = VideoProcessor(vae_scale_factor=8)
80
+ os.makedirs(args.output_dir, exist_ok=True)
81
+ os.makedirs(os.path.join(args.output_dir, "video"), exist_ok=True)
82
+ #os.makedirs(os.path.join(args.output_dir, "latent"), exist_ok=True)
83
+ os.makedirs(os.path.join(args.output_dir, "prompt_embed"), exist_ok=True)
84
+ os.makedirs(os.path.join(args.output_dir, "prompt_attention_mask"), exist_ok=True)
85
+
86
+ latents_txt_path = args.prompt_dir
87
+ train_dataset = T5dataset(latents_txt_path, args.vae_debug)
88
+ text_encoder = load_text_encoder(args.model_type, args.model_path, device=device)
89
+ #vae, autocast_type, fps = load_vae(args.model_type, args.model_path)
90
+ #vae.enable_tiling()
91
+ sampler = DistributedSampler(
92
+ train_dataset, rank=local_rank, num_replicas=world_size, shuffle=True
93
+ )
94
+ train_dataloader = DataLoader(
95
+ train_dataset,
96
+ sampler=sampler,
97
+ batch_size=args.train_batch_size,
98
+ num_workers=args.dataloader_num_workers,
99
+ )
100
+
101
+ json_data = []
102
+ for _, data in tqdm(enumerate(train_dataloader), disable=local_rank != 0):
103
+ with torch.inference_mode():
104
+ with torch.autocast("cuda"):
105
+ prompt_embeds, prompt_attention_mask = text_encoder.encode_prompt(
106
+ prompt=data["caption"],
107
+ )
108
+ if args.vae_debug:
109
+ latents = data["latents"]
110
+ #video = vae.decode(latents.to(device), return_dict=False)[0]
111
+ #video = videoprocessor.postprocess_video(video)
112
+ for idx, video_name in enumerate(data["filename"]):
113
+ prompt_embed_path = os.path.join(
114
+ args.output_dir, "prompt_embed", video_name + ".pt"
115
+ )
116
+ #video_path = os.path.join(
117
+ # args.output_dir, "video", video_name + ".mp4"
118
+ #)
119
+ prompt_attention_mask_path = os.path.join(
120
+ args.output_dir, "prompt_attention_mask", video_name + ".pt"
121
+ )
122
+ # save latent
123
+ torch.save(prompt_embeds[idx], prompt_embed_path)
124
+ torch.save(prompt_attention_mask[idx], prompt_attention_mask_path)
125
+ #print(f"sample {video_name} saved")
126
+ #if args.vae_debug:
127
+ # export_to_video(video[idx], video_path, fps=fps)
128
+ item = {}
129
+ #item["length"] = int(data["length"][idx])
130
+ #item["latent_path"] = video_name + ".pt"
131
+ item["prompt_embed_path"] = video_name + ".pt"
132
+ item["prompt_attention_mask"] = video_name + ".pt"
133
+ item["caption"] = data["caption"][idx]
134
+ json_data.append(item)
135
+ dist.barrier()
136
+ local_data = json_data
137
+ gathered_data = [None] * world_size
138
+ dist.all_gather_object(gathered_data, local_data)
139
+ if local_rank == 0:
140
+ # os.remove(latents_json_path)
141
+ all_json_data = [item for sublist in gathered_data for item in sublist]
142
+ with open(os.path.join(args.output_dir, "videos2caption.json"), "w") as f:
143
+ json.dump(all_json_data, f, indent=4)
144
+
145
+
146
+ if __name__ == "__main__":
147
+ parser = argparse.ArgumentParser()
148
+ # dataset & dataloader
149
+ parser.add_argument("--model_path", type=str, default="data/mochi")
150
+ parser.add_argument("--model_type", type=str, default="mochi")
151
+ # text encoder & vae & diffusion model
152
+ parser.add_argument(
153
+ "--dataloader_num_workers",
154
+ type=int,
155
+ default=1,
156
+ help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.",
157
+ )
158
+ parser.add_argument(
159
+ "--train_batch_size",
160
+ type=int,
161
+ default=1,
162
+ help="Batch size (per device) for the training dataloader.",
163
+ )
164
+ parser.add_argument("--text_encoder_name", type=str, default="google/t5-v1_1-xxl")
165
+ parser.add_argument("--cache_dir", type=str, default="./cache_dir")
166
+ parser.add_argument(
167
+ "--output_dir",
168
+ type=str,
169
+ default=None,
170
+ help="The output directory where the model predictions and checkpoints will be written.",
171
+ )
172
+ parser.add_argument("--vae_debug", action="store_true")
173
+ parser.add_argument("--prompt_dir", type=str, default="./empty.txt")
174
+ args = parser.parse_args()
175
+ main(args)
fastvideo/data_preprocess/preprocess_text_embeddings.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #This code file is from [https://github.com/hao-ai-lab/FastVideo], which is licensed under Apache License 2.0.
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ from accelerate.logging import get_logger
10
+ from diffusers.utils import export_to_video
11
+ from diffusers.video_processor import VideoProcessor
12
+ from torch.utils.data import DataLoader, Dataset
13
+ from torch.utils.data.distributed import DistributedSampler
14
+ from tqdm import tqdm
15
+
16
+ from fastvideo.utils.load import load_text_encoder, load_vae
17
+
18
+ logger = get_logger(__name__)
19
+
20
+
21
+ class T5dataset(Dataset):
22
+
23
+ def __init__(
24
+ self,
25
+ json_path,
26
+ vae_debug,
27
+ ):
28
+ self.json_path = json_path
29
+ self.vae_debug = vae_debug
30
+ with open(self.json_path, "r") as f:
31
+ train_dataset = json.load(f)
32
+ self.train_dataset = sorted(train_dataset,
33
+ key=lambda x: x["latent_path"])
34
+
35
+ def __getitem__(self, idx):
36
+ caption = self.train_dataset[idx]["caption"]
37
+ filename = self.train_dataset[idx]["latent_path"].split(".")[0]
38
+ length = self.train_dataset[idx]["length"]
39
+ if self.vae_debug:
40
+ latents = torch.load(
41
+ os.path.join(args.output_dir, "latent",
42
+ self.train_dataset[idx]["latent_path"]),
43
+ map_location="cpu",
44
+ )
45
+ else:
46
+ latents = []
47
+
48
+ return dict(caption=caption,
49
+ latents=latents,
50
+ filename=filename,
51
+ length=length)
52
+
53
+ def __len__(self):
54
+ return len(self.train_dataset)
55
+
56
+
57
+ def main(args):
58
+ local_rank = int(os.getenv("RANK", 0))
59
+ world_size = int(os.getenv("WORLD_SIZE", 1))
60
+ print("world_size", world_size, "local rank", local_rank)
61
+
62
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
63
+ torch.cuda.set_device(local_rank)
64
+ if not dist.is_initialized():
65
+ dist.init_process_group(backend="nccl",
66
+ init_method="env://",
67
+ world_size=world_size,
68
+ rank=local_rank)
69
+
70
+ videoprocessor = VideoProcessor(vae_scale_factor=8)
71
+ os.makedirs(args.output_dir, exist_ok=True)
72
+ os.makedirs(os.path.join(args.output_dir, "video"), exist_ok=True)
73
+ os.makedirs(os.path.join(args.output_dir, "latent"), exist_ok=True)
74
+ os.makedirs(os.path.join(args.output_dir, "prompt_embed"), exist_ok=True)
75
+ os.makedirs(os.path.join(args.output_dir, "prompt_attention_mask"),
76
+ exist_ok=True)
77
+
78
+ latents_json_path = os.path.join(args.output_dir,
79
+ "videos2caption_temp.json")
80
+ train_dataset = T5dataset(latents_json_path, args.vae_debug)
81
+ text_encoder = load_text_encoder(args.model_type,
82
+ args.model_path,
83
+ device=device)
84
+ vae, autocast_type, fps = load_vae(args.model_type, args.model_path)
85
+ vae.enable_tiling()
86
+ sampler = DistributedSampler(train_dataset,
87
+ rank=local_rank,
88
+ num_replicas=world_size,
89
+ shuffle=True)
90
+ train_dataloader = DataLoader(
91
+ train_dataset,
92
+ sampler=sampler,
93
+ batch_size=args.train_batch_size,
94
+ num_workers=args.dataloader_num_workers,
95
+ )
96
+
97
+ json_data = []
98
+ for _, data in tqdm(enumerate(train_dataloader), disable=local_rank != 0):
99
+ with torch.inference_mode():
100
+ with torch.autocast("cuda", dtype=autocast_type):
101
+ prompt_embeds, prompt_attention_mask = text_encoder.encode_prompt(
102
+ prompt=data["caption"], )
103
+ if args.vae_debug:
104
+ latents = data["latents"]
105
+ video = vae.decode(latents.to(device),
106
+ return_dict=False)[0]
107
+ video = videoprocessor.postprocess_video(video)
108
+ for idx, video_name in enumerate(data["filename"]):
109
+ prompt_embed_path = os.path.join(args.output_dir,
110
+ "prompt_embed",
111
+ video_name + ".pt")
112
+ video_path = os.path.join(args.output_dir, "video",
113
+ video_name + ".mp4")
114
+ prompt_attention_mask_path = os.path.join(
115
+ args.output_dir, "prompt_attention_mask",
116
+ video_name + ".pt")
117
+ # save latent
118
+ torch.save(prompt_embeds[idx], prompt_embed_path)
119
+ torch.save(prompt_attention_mask[idx],
120
+ prompt_attention_mask_path)
121
+ print(f"sample {video_name} saved")
122
+ if args.vae_debug:
123
+ export_to_video(video[idx], video_path, fps=fps)
124
+ item = {}
125
+ item["length"] = int(data["length"][idx])
126
+ item["latent_path"] = video_name + ".pt"
127
+ item["prompt_embed_path"] = video_name + ".pt"
128
+ item["prompt_attention_mask"] = video_name + ".pt"
129
+ item["caption"] = data["caption"][idx]
130
+ json_data.append(item)
131
+ dist.barrier()
132
+ local_data = json_data
133
+ gathered_data = [None] * world_size
134
+ dist.all_gather_object(gathered_data, local_data)
135
+ if local_rank == 0:
136
+ # os.remove(latents_json_path)
137
+ all_json_data = [item for sublist in gathered_data for item in sublist]
138
+ with open(os.path.join(args.output_dir, "videos2caption.json"),
139
+ "w") as f:
140
+ json.dump(all_json_data, f, indent=4)
141
+
142
+
143
+ if __name__ == "__main__":
144
+ parser = argparse.ArgumentParser()
145
+ # dataset & dataloader
146
+ parser.add_argument("--model_path", type=str, default="data/mochi")
147
+ parser.add_argument("--model_type", type=str, default="mochi")
148
+ # text encoder & vae & diffusion model
149
+ parser.add_argument(
150
+ "--dataloader_num_workers",
151
+ type=int,
152
+ default=1,
153
+ help=
154
+ "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.",
155
+ )
156
+ parser.add_argument(
157
+ "--train_batch_size",
158
+ type=int,
159
+ default=1,
160
+ help="Batch size (per device) for the training dataloader.",
161
+ )
162
+ parser.add_argument("--text_encoder_name",
163
+ type=str,
164
+ default="google/t5-v1_1-xxl")
165
+ parser.add_argument("--cache_dir", type=str, default="./cache_dir")
166
+ parser.add_argument(
167
+ "--output_dir",
168
+ type=str,
169
+ default=None,
170
+ help=
171
+ "The output directory where the model predictions and checkpoints will be written.",
172
+ )
173
+ parser.add_argument("--vae_debug", action="store_true")
174
+ args = parser.parse_args()
175
+ main(args)
fastvideo/data_preprocess/preprocess_vae_latents.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #This code file is from [https://github.com/hao-ai-lab/FastVideo], which is licensed under Apache License 2.0.
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ from accelerate.logging import get_logger
10
+ from torch.utils.data import DataLoader
11
+ from torch.utils.data.distributed import DistributedSampler
12
+ from tqdm import tqdm
13
+
14
+ from fastvideo.dataset import getdataset
15
+ from fastvideo.utils.load import load_vae
16
+
17
+ logger = get_logger(__name__)
18
+
19
+
20
+ def main(args):
21
+ local_rank = int(os.getenv("RANK", 0))
22
+ world_size = int(os.getenv("WORLD_SIZE", 1))
23
+ print("world_size", world_size, "local rank", local_rank)
24
+ train_dataset = getdataset(args)
25
+ sampler = DistributedSampler(train_dataset,
26
+ rank=local_rank,
27
+ num_replicas=world_size,
28
+ shuffle=True)
29
+ train_dataloader = DataLoader(
30
+ train_dataset,
31
+ sampler=sampler,
32
+ batch_size=args.train_batch_size,
33
+ num_workers=args.dataloader_num_workers,
34
+ )
35
+
36
+ encoder_device = torch.device(
37
+ "cuda" if torch.cuda.is_available() else "cpu")
38
+ torch.cuda.set_device(local_rank)
39
+ if not dist.is_initialized():
40
+ dist.init_process_group(backend="nccl",
41
+ init_method="env://",
42
+ world_size=world_size,
43
+ rank=local_rank)
44
+ vae, autocast_type, fps = load_vae(args.model_type, args.model_path)
45
+ vae.enable_tiling()
46
+ os.makedirs(args.output_dir, exist_ok=True)
47
+ os.makedirs(os.path.join(args.output_dir, "latent"), exist_ok=True)
48
+
49
+ json_data = []
50
+ for _, data in tqdm(enumerate(train_dataloader), disable=local_rank != 0):
51
+ with torch.inference_mode():
52
+ with torch.autocast("cuda", dtype=autocast_type):
53
+ latents = vae.encode(data["pixel_values"].to(
54
+ encoder_device))["latent_dist"].sample()
55
+ for idx, video_path in enumerate(data["path"]):
56
+ video_name = os.path.basename(video_path).split(".")[0]
57
+ latent_path = os.path.join(args.output_dir, "latent",
58
+ video_name + ".pt")
59
+ torch.save(latents[idx].to(torch.bfloat16), latent_path)
60
+ item = {}
61
+ item["length"] = latents[idx].shape[1]
62
+ item["latent_path"] = video_name + ".pt"
63
+ item["caption"] = data["text"][idx]
64
+ json_data.append(item)
65
+ print(f"{video_name} processed")
66
+ dist.barrier()
67
+ local_data = json_data
68
+ gathered_data = [None] * world_size
69
+ dist.all_gather_object(gathered_data, local_data)
70
+ if local_rank == 0:
71
+ all_json_data = [item for sublist in gathered_data for item in sublist]
72
+ with open(os.path.join(args.output_dir, "videos2caption_temp.json"),
73
+ "w") as f:
74
+ json.dump(all_json_data, f, indent=4)
75
+
76
+
77
+ if __name__ == "__main__":
78
+ parser = argparse.ArgumentParser()
79
+ # dataset & dataloader
80
+ parser.add_argument("--model_path", type=str, default="data/mochi")
81
+ parser.add_argument("--model_type", type=str, default="mochi")
82
+ parser.add_argument("--data_merge_path", type=str, required=True)
83
+ parser.add_argument("--num_frames", type=int, default=163)
84
+ parser.add_argument(
85
+ "--dataloader_num_workers",
86
+ type=int,
87
+ default=1,
88
+ help=
89
+ "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.",
90
+ )
91
+ parser.add_argument(
92
+ "--train_batch_size",
93
+ type=int,
94
+ default=16,
95
+ help="Batch size (per device) for the training dataloader.",
96
+ )
97
+ parser.add_argument("--num_latent_t",
98
+ type=int,
99
+ default=28,
100
+ help="Number of latent timesteps.")
101
+ parser.add_argument("--max_height", type=int, default=480)
102
+ parser.add_argument("--max_width", type=int, default=848)
103
+ parser.add_argument("--video_length_tolerance_range",
104
+ type=int,
105
+ default=2.0)
106
+ parser.add_argument("--group_frame", action="store_true") # TODO
107
+ parser.add_argument("--group_resolution", action="store_true") # TODO
108
+ parser.add_argument("--dataset", default="t2v")
109
+ parser.add_argument("--train_fps", type=int, default=30)
110
+ parser.add_argument("--use_image_num", type=int, default=0)
111
+ parser.add_argument("--text_max_length", type=int, default=256)
112
+ parser.add_argument("--speed_factor", type=float, default=1.0)
113
+ parser.add_argument("--drop_short_ratio", type=float, default=1.0)
114
+ # text encoder & vae & diffusion model
115
+ parser.add_argument("--text_encoder_name",
116
+ type=str,
117
+ default="google/t5-v1_1-xxl")
118
+ parser.add_argument("--cache_dir", type=str, default="./cache_dir")
119
+ parser.add_argument("--cfg", type=float, default=0.0)
120
+ parser.add_argument(
121
+ "--output_dir",
122
+ type=str,
123
+ default=None,
124
+ help=
125
+ "The output directory where the model predictions and checkpoints will be written.",
126
+ )
127
+ parser.add_argument(
128
+ "--logging_dir",
129
+ type=str,
130
+ default="logs",
131
+ help=
132
+ ("[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
133
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."),
134
+ )
135
+
136
+ args = parser.parse_args()
137
+ main(args)
fastvideo/data_preprocess/preprocess_validation_text_embeddings.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #This code file is from [https://github.com/hao-ai-lab/FastVideo], which is licensed under Apache License 2.0.
2
+
3
+ import argparse
4
+ import os
5
+
6
+ import torch
7
+ import torch.distributed as dist
8
+ from accelerate.logging import get_logger
9
+
10
+ from fastvideo.utils.load import load_text_encoder
11
+
12
+ logger = get_logger(__name__)
13
+
14
+
15
+ def main(args):
16
+ local_rank = int(os.getenv("RANK", 0))
17
+ world_size = int(os.getenv("WORLD_SIZE", 1))
18
+ print("world_size", world_size, "local rank", local_rank)
19
+
20
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21
+ torch.cuda.set_device(local_rank)
22
+ if not dist.is_initialized():
23
+ dist.init_process_group(backend="nccl",
24
+ init_method="env://",
25
+ world_size=world_size,
26
+ rank=local_rank)
27
+
28
+ text_encoder = load_text_encoder(args.model_type,
29
+ args.model_path,
30
+ device=device)
31
+ autocast_type = torch.float16 if args.model_type == "hunyuan" else torch.bfloat16
32
+ # output_dir/validation/prompt_attention_mask
33
+ # output_dir/validation/prompt_embed
34
+ os.makedirs(os.path.join(args.output_dir, "validation"), exist_ok=True)
35
+ os.makedirs(
36
+ os.path.join(args.output_dir, "validation", "prompt_attention_mask"),
37
+ exist_ok=True,
38
+ )
39
+ os.makedirs(os.path.join(args.output_dir, "validation", "prompt_embed"),
40
+ exist_ok=True)
41
+
42
+ with open(args.validation_prompt_txt, "r", encoding="utf-8") as file:
43
+ lines = file.readlines()
44
+ prompts = [line.strip() for line in lines]
45
+ for prompt in prompts:
46
+ with torch.inference_mode():
47
+ with torch.autocast("cuda", dtype=autocast_type):
48
+ prompt_embeds, prompt_attention_mask = text_encoder.encode_prompt(
49
+ prompt)
50
+ file_name = prompt.split(".")[0]
51
+ prompt_embed_path = os.path.join(args.output_dir, "validation",
52
+ "prompt_embed",
53
+ f"{file_name}.pt")
54
+ prompt_attention_mask_path = os.path.join(
55
+ args.output_dir,
56
+ "validation",
57
+ "prompt_attention_mask",
58
+ f"{file_name}.pt",
59
+ )
60
+ torch.save(prompt_embeds[0], prompt_embed_path)
61
+ torch.save(prompt_attention_mask[0],
62
+ prompt_attention_mask_path)
63
+ print(f"sample {file_name} saved")
64
+
65
+
66
+ if __name__ == "__main__":
67
+ parser = argparse.ArgumentParser()
68
+ # dataset & dataloader
69
+ parser.add_argument("--model_path", type=str, default="data/mochi")
70
+ parser.add_argument("--model_type", type=str, default="mochi")
71
+ parser.add_argument("--validation_prompt_txt", type=str)
72
+ parser.add_argument(
73
+ "--output_dir",
74
+ type=str,
75
+ default=None,
76
+ help=
77
+ "The output directory where the model predictions and checkpoints will be written.",
78
+ )
79
+ args = parser.parse_args()
80
+ main(args)
fastvideo/dataset/.DS_Store ADDED
Binary file (6.15 kB). View file
 
fastvideo/dataset/__init__.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torchvision import transforms
2
+ from torchvision.transforms import Lambda
3
+ from transformers import AutoTokenizer
4
+
5
+ from fastvideo.dataset.t2v_datasets import T2V_dataset
6
+ from fastvideo.dataset.transform import (CenterCropResizeVideo, Normalize255,
7
+ TemporalRandomCrop)
8
+
9
+
10
+ def getdataset(args):
11
+ temporal_sample = TemporalRandomCrop(args.num_frames) # 16 x
12
+ norm_fun = Lambda(lambda x: 2.0 * x - 1.0)
13
+ resize_topcrop = [
14
+ CenterCropResizeVideo((args.max_height, args.max_width),
15
+ top_crop=True),
16
+ ]
17
+ resize = [
18
+ CenterCropResizeVideo((args.max_height, args.max_width)),
19
+ ]
20
+ transform = transforms.Compose([
21
+ # Normalize255(),
22
+ *resize,
23
+ ])
24
+ transform_topcrop = transforms.Compose([
25
+ Normalize255(),
26
+ *resize_topcrop,
27
+ norm_fun,
28
+ ])
29
+ # tokenizer = AutoTokenizer.from_pretrained("/storage/ongoing/new/Open-Sora-Plan/cache_dir/mt5-xxl", cache_dir=args.cache_dir)
30
+ tokenizer = AutoTokenizer.from_pretrained(args.text_encoder_name,
31
+ cache_dir=args.cache_dir)
32
+ if args.dataset == "t2v":
33
+ return T2V_dataset(
34
+ args,
35
+ transform=transform,
36
+ temporal_sample=temporal_sample,
37
+ tokenizer=tokenizer,
38
+ transform_topcrop=transform_topcrop,
39
+ )
40
+
41
+ raise NotImplementedError(args.dataset)
42
+
43
+
44
+ if __name__ == "__main__":
45
+ import random
46
+
47
+ from accelerate import Accelerator
48
+ from tqdm import tqdm
49
+
50
+ from fastvideo.dataset.t2v_datasets import dataset_prog
51
+
52
+ args = type(
53
+ "args",
54
+ (),
55
+ {
56
+ "ae": "CausalVAEModel_4x8x8",
57
+ "dataset": "t2v",
58
+ "attention_mode": "xformers",
59
+ "use_rope": True,
60
+ "text_max_length": 300,
61
+ "max_height": 320,
62
+ "max_width": 240,
63
+ "num_frames": 1,
64
+ "use_image_num": 0,
65
+ "interpolation_scale_t": 1,
66
+ "interpolation_scale_h": 1,
67
+ "interpolation_scale_w": 1,
68
+ "cache_dir": "../cache_dir",
69
+ "image_data":
70
+ "/storage/ongoing/new/Open-Sora-Plan-bak/7.14bak/scripts/train_data/image_data.txt",
71
+ "video_data": "1",
72
+ "train_fps": 24,
73
+ "drop_short_ratio": 1.0,
74
+ "use_img_from_vid": False,
75
+ "speed_factor": 1.0,
76
+ "cfg": 0.1,
77
+ "text_encoder_name": "google/mt5-xxl",
78
+ "dataloader_num_workers": 10,
79
+ },
80
+ )
81
+ accelerator = Accelerator()
82
+ dataset = getdataset(args)
83
+ num = len(dataset_prog.img_cap_list)
84
+ zero = 0
85
+ for idx in tqdm(range(num)):
86
+ image_data = dataset_prog.img_cap_list[idx]
87
+ caps = [
88
+ i["cap"] if isinstance(i["cap"], list) else [i["cap"]]
89
+ for i in image_data
90
+ ]
91
+ try:
92
+ caps = [[random.choice(i)] for i in caps]
93
+ except Exception as e:
94
+ print(e)
95
+ # import ipdb;ipdb.set_trace()
96
+ print(image_data)
97
+ zero += 1
98
+ continue
99
+ assert caps[0] is not None and len(caps[0]) > 0
100
+ print(num, zero)
101
+ import ipdb
102
+
103
+ ipdb.set_trace()
104
+ print("end")
fastvideo/dataset/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (2.78 kB). View file
 
fastvideo/dataset/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (3.94 kB). View file
 
fastvideo/dataset/__pycache__/latent_flux_rfpt_datasets.cpython-312.pyc ADDED
Binary file (5.12 kB). View file
 
fastvideo/dataset/__pycache__/latent_flux_rfpt_datasets_all.cpython-312.pyc ADDED
Binary file (5.56 kB). View file
 
fastvideo/dataset/__pycache__/latent_flux_rl_datasets.cpython-312.pyc ADDED
Binary file (4.67 kB). View file
 
fastvideo/dataset/__pycache__/latent_qwenimage_rl_datasets.cpython-310.pyc ADDED
Binary file (2.24 kB). View file
 
fastvideo/dataset/__pycache__/t2v_datasets.cpython-310.pyc ADDED
Binary file (9.14 kB). View file
 
fastvideo/dataset/__pycache__/t2v_datasets.cpython-312.pyc ADDED
Binary file (16 kB). View file
 
fastvideo/dataset/__pycache__/transform.cpython-310.pyc ADDED
Binary file (18.3 kB). View file
 
fastvideo/dataset/__pycache__/transform.cpython-312.pyc ADDED
Binary file (27.3 kB). View file
 
fastvideo/dataset/latent_datasets.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #This code file is from [https://github.com/hao-ai-lab/FastVideo], which is licensed under Apache License 2.0.
2
+
3
+ import json
4
+ import os
5
+ import random
6
+
7
+ import torch
8
+ from torch.utils.data import Dataset
9
+
10
+
11
+ class LatentDataset(Dataset):
12
+
13
+ def __init__(
14
+ self,
15
+ json_path,
16
+ num_latent_t,
17
+ cfg_rate,
18
+ ):
19
+ # data_merge_path: video_dir, latent_dir, prompt_embed_dir, json_path
20
+ self.json_path = json_path
21
+ self.cfg_rate = cfg_rate
22
+ self.datase_dir_path = os.path.dirname(json_path)
23
+ self.video_dir = os.path.join(self.datase_dir_path, "video")
24
+ self.latent_dir = os.path.join(self.datase_dir_path, "latent")
25
+ self.prompt_embed_dir = os.path.join(self.datase_dir_path,
26
+ "prompt_embed")
27
+ self.prompt_attention_mask_dir = os.path.join(self.datase_dir_path,
28
+ "prompt_attention_mask")
29
+ with open(self.json_path, "r") as f:
30
+ self.data_anno = json.load(f)
31
+ # json.load(f) already keeps the order
32
+ # self.data_anno = sorted(self.data_anno, key=lambda x: x['latent_path'])
33
+ self.num_latent_t = num_latent_t
34
+ # just zero embeddings [256, 4096]
35
+ self.uncond_prompt_embed = torch.zeros(256, 4096).to(torch.float32)
36
+ # 256 zeros
37
+ self.uncond_prompt_mask = torch.zeros(256).bool()
38
+ self.lengths = [
39
+ data_item["length"] if "length" in data_item else 1
40
+ for data_item in self.data_anno
41
+ ]
42
+
43
+ def __getitem__(self, idx):
44
+ latent_file = self.data_anno[idx]["latent_path"]
45
+ prompt_embed_file = self.data_anno[idx]["prompt_embed_path"]
46
+ prompt_attention_mask_file = self.data_anno[idx][
47
+ "prompt_attention_mask"]
48
+ # load
49
+ latent = torch.load(
50
+ os.path.join(self.latent_dir, latent_file),
51
+ map_location="cpu",
52
+ weights_only=True,
53
+ )
54
+ latent = latent.squeeze(0)[:, -self.num_latent_t:]
55
+ if random.random() < self.cfg_rate:
56
+ prompt_embed = self.uncond_prompt_embed
57
+ prompt_attention_mask = self.uncond_prompt_mask
58
+ else:
59
+ prompt_embed = torch.load(
60
+ os.path.join(self.prompt_embed_dir, prompt_embed_file),
61
+ map_location="cpu",
62
+ weights_only=True,
63
+ )
64
+ prompt_attention_mask = torch.load(
65
+ os.path.join(self.prompt_attention_mask_dir,
66
+ prompt_attention_mask_file),
67
+ map_location="cpu",
68
+ weights_only=True,
69
+ )
70
+ return latent, prompt_embed, prompt_attention_mask
71
+
72
+ def __len__(self):
73
+ return len(self.data_anno)
74
+
75
+
76
+ def latent_collate_function(batch):
77
+ # return latent, prompt, latent_attn_mask, text_attn_mask
78
+ # latent_attn_mask: # b t h w
79
+ # text_attn_mask: b 1 l
80
+ # needs to check if the latent/prompt' size and apply padding & attn mask
81
+ latents, prompt_embeds, prompt_attention_masks = zip(*batch)
82
+ # calculate max shape
83
+ max_t = max([latent.shape[1] for latent in latents])
84
+ max_h = max([latent.shape[2] for latent in latents])
85
+ max_w = max([latent.shape[3] for latent in latents])
86
+
87
+ # padding
88
+ latents = [
89
+ torch.nn.functional.pad(
90
+ latent,
91
+ (
92
+ 0,
93
+ max_t - latent.shape[1],
94
+ 0,
95
+ max_h - latent.shape[2],
96
+ 0,
97
+ max_w - latent.shape[3],
98
+ ),
99
+ ) for latent in latents
100
+ ]
101
+ # attn mask
102
+ latent_attn_mask = torch.ones(len(latents), max_t, max_h, max_w)
103
+ # set to 0 if padding
104
+ for i, latent in enumerate(latents):
105
+ latent_attn_mask[i, latent.shape[1]:, :, :] = 0
106
+ latent_attn_mask[i, :, latent.shape[2]:, :] = 0
107
+ latent_attn_mask[i, :, :, latent.shape[3]:] = 0
108
+
109
+ prompt_embeds = torch.stack(prompt_embeds, dim=0)
110
+ prompt_attention_masks = torch.stack(prompt_attention_masks, dim=0)
111
+ latents = torch.stack(latents, dim=0)
112
+ return latents, prompt_embeds, latent_attn_mask, prompt_attention_masks
113
+
114
+
115
+ if __name__ == "__main__":
116
+ dataset = LatentDataset("data/Mochi-Synthetic-Data/merge.txt",
117
+ num_latent_t=28)
118
+ dataloader = torch.utils.data.DataLoader(
119
+ dataset,
120
+ batch_size=2,
121
+ shuffle=False,
122
+ collate_fn=latent_collate_function)
123
+ for latent, prompt_embed, latent_attn_mask, prompt_attention_mask in dataloader:
124
+ print(
125
+ latent.shape,
126
+ prompt_embed.shape,
127
+ latent_attn_mask.shape,
128
+ prompt_attention_mask.shape,
129
+ )
130
+ import pdb
131
+
132
+ pdb.set_trace()
fastvideo/dataset/latent_flux_rfpt_datasets.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) [2025] [FastVideo Team]
2
+ # Copyright (c) [2025] [ByteDance Ltd. and/or its affiliates.]
3
+ # SPDX-License-Identifier: [Apache License 2.0]
4
+ #
5
+ # This file has been modified by [ByteDance Ltd. and/or its affiliates.] in 2025.
6
+ #
7
+ # Original file was released under [Apache License 2.0], with the full license text
8
+ # available at [https://github.com/hao-ai-lab/FastVideo/blob/main/LICENSE].
9
+ #
10
+ # This modified file is released under the same license.
11
+
12
+ import torch
13
+ from torch.utils.data import Dataset
14
+ import json
15
+ import os
16
+ import random
17
+
18
+
19
+ class LatentDataset(Dataset):
20
+ def __init__(
21
+ self, json_path, num_latent_t, cfg_rate,
22
+ ):
23
+ # data_merge_path: video_dir, latent_dir, prompt_embed_dir, json_path
24
+ self.json_path = json_path
25
+ self.cfg_rate = cfg_rate
26
+ self.datase_dir_path = os.path.dirname(json_path)
27
+ #self.video_dir = os.path.join(self.datase_dir_path, "video")
28
+ #self.latent_dir = os.path.join(self.datase_dir_path, "latent")
29
+ self.prompt_embed_dir = os.path.join(self.datase_dir_path, "prompt_embed")
30
+ self.pooled_prompt_embeds_dir = os.path.join(
31
+ self.datase_dir_path, "pooled_prompt_embeds"
32
+ )
33
+ self.text_ids_dir = os.path.join(
34
+ self.datase_dir_path, "text_ids"
35
+ )
36
+ self.latents_dir = os.path.join(
37
+ self.datase_dir_path, "images"
38
+ )
39
+ with open(self.json_path, "r") as f:
40
+ self.data_anno = json.load(f)
41
+ # json.load(f) already keeps the order
42
+ # self.data_anno = sorted(self.data_anno, key=lambda x: x['latent_path'])
43
+ self.num_latent_t = num_latent_t
44
+ # just zero embeddings [256, 4096]
45
+ self.uncond_prompt_embed = torch.zeros(256, 4096).to(torch.float32)
46
+ # 256 zeros
47
+ self.uncond_prompt_mask = torch.zeros(256).bool()
48
+ self.lengths = [
49
+ data_item["length"] if "length" in data_item else 1
50
+ for data_item in self.data_anno
51
+ ]
52
+
53
+ def __getitem__(self, idx):
54
+ #latent_file = self.data_anno[idx]["latent_path"]
55
+ prompt_embed_file = self.data_anno[idx]["prompt_embed_path"]
56
+ pooled_prompt_embeds_file = self.data_anno[idx]["pooled_prompt_embeds_path"]
57
+ text_ids_file = self.data_anno[idx]["text_ids"]
58
+ latent_file = text_ids_file
59
+ if random.random() < self.cfg_rate:
60
+ prompt_embed = self.uncond_prompt_embed
61
+ else:
62
+ prompt_embed = torch.load(
63
+ os.path.join(self.prompt_embed_dir, prompt_embed_file),
64
+ map_location="cpu",
65
+ weights_only=True,
66
+ )
67
+ pooled_prompt_embeds = torch.load(
68
+ os.path.join(
69
+ self.pooled_prompt_embeds_dir, pooled_prompt_embeds_file
70
+ ),
71
+ map_location="cpu",
72
+ weights_only=True,
73
+ )
74
+ text_ids = torch.load(
75
+ os.path.join(
76
+ self.text_ids_dir, text_ids_file
77
+ ),
78
+ map_location="cpu",
79
+ weights_only=True,
80
+ )
81
+ latents = torch.load(
82
+ os.path.join(
83
+ self.latents_dir, latent_file
84
+ ),
85
+ map_location="cpu",
86
+ weights_only=True,
87
+ )
88
+ return prompt_embed, pooled_prompt_embeds, text_ids, self.data_anno[idx]['caption'], latents
89
+
90
+ def __len__(self):
91
+ return len(self.data_anno)
92
+
93
+
94
+ def latent_collate_function(batch):
95
+ # return latent, prompt, latent_attn_mask, text_attn_mask
96
+ # latent_attn_mask: # b t h w
97
+ # text_attn_mask: b 1 l
98
+ # needs to check if the latent/prompt' size and apply padding & attn mask
99
+ prompt_embeds, pooled_prompt_embeds, text_ids, caption, latents = zip(*batch)
100
+ # attn mask
101
+ prompt_embeds = torch.stack(prompt_embeds, dim=0)
102
+ pooled_prompt_embeds = torch.stack(pooled_prompt_embeds, dim=0)
103
+ text_ids = torch.stack(text_ids, dim=0)
104
+ latents= torch.stack(latents, dim=0)
105
+ #latents = torch.stack(latents, dim=0)
106
+ return prompt_embeds, pooled_prompt_embeds, text_ids, caption, latents
107
+
108
+
109
+ if __name__ == "__main__":
110
+ dataset = LatentDataset("data/rl_embeddings/videos2caption.json", num_latent_t=28, cfg_rate=0.0)
111
+ dataloader = torch.utils.data.DataLoader(
112
+ dataset, batch_size=2, shuffle=False, collate_fn=latent_collate_function
113
+ )
114
+ for prompt_embed, prompt_attention_mask, caption in dataloader:
115
+ print(
116
+ prompt_embed.shape,
117
+ prompt_attention_mask.shape,
118
+ caption
119
+ )
120
+ import pdb
121
+
122
+ pdb.set_trace()
fastvideo/dataset/latent_flux_rfpt_datasets_all.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) [2025] [FastVideo Team]
2
+ # Copyright (c) [2025] [ByteDance Ltd. and/or its affiliates.]
3
+ # SPDX-License-Identifier: [Apache License 2.0]
4
+ #
5
+ # This file has been modified by [ByteDance Ltd. and/or its affiliates.] in 2025.
6
+ #
7
+ # Original file was released under [Apache License 2.0], with the full license text
8
+ # available at [https://github.com/hao-ai-lab/FastVideo/blob/main/LICENSE].
9
+ #
10
+ # This modified file is released under the same license.
11
+
12
+ import torch
13
+ from torch.utils.data import Dataset
14
+ import json
15
+ import os
16
+ import random
17
+
18
+
19
+ class LatentDataset(Dataset):
20
+ def __init__(
21
+ self, json_path, num_latent_t, cfg_rate,
22
+ ):
23
+ # data_merge_path: video_dir, latent_dir, prompt_embed_dir, json_path
24
+ self.json_path = json_path
25
+ self.cfg_rate = cfg_rate
26
+ self.datase_dir_path = os.path.dirname(json_path)
27
+ #self.video_dir = os.path.join(self.datase_dir_path, "video")
28
+ #self.latent_dir = os.path.join(self.datase_dir_path, "latent")
29
+ self.prompt_embed_dir = os.path.join(self.datase_dir_path, "prompt_embed")
30
+ self.pooled_prompt_embeds_dir = os.path.join(
31
+ self.datase_dir_path, "pooled_prompt_embeds"
32
+ )
33
+ self.text_ids_dir = os.path.join(
34
+ self.datase_dir_path, "text_ids"
35
+ )
36
+ self.images_dir = os.path.join(
37
+ self.datase_dir_path, "images"
38
+ )
39
+ self.latents_dir = os.path.join(
40
+ self.datase_dir_path, "latents"
41
+ )
42
+ with open(self.json_path, "r") as f:
43
+ self.data_anno = json.load(f)
44
+ # json.load(f) already keeps the order
45
+ # self.data_anno = sorted(self.data_anno, key=lambda x: x['latent_path'])
46
+ self.num_latent_t = num_latent_t
47
+ # just zero embeddings [256, 4096]
48
+ self.uncond_prompt_embed = torch.zeros(256, 4096).to(torch.float32)
49
+ # 256 zeros
50
+ self.uncond_prompt_mask = torch.zeros(256).bool()
51
+ self.lengths = [
52
+ data_item["length"] if "length" in data_item else 1
53
+ for data_item in self.data_anno
54
+ ]
55
+
56
+ def __getitem__(self, idx):
57
+ #latent_file = self.data_anno[idx]["latent_path"]
58
+ prompt_embed_file = self.data_anno[idx]["prompt_embed_path"]
59
+ pooled_prompt_embeds_file = self.data_anno[idx]["pooled_prompt_embeds_path"]
60
+ text_ids_file = self.data_anno[idx]["text_ids"]
61
+ latent_file = text_ids_file
62
+ image_file = text_ids_file
63
+ if random.random() < self.cfg_rate:
64
+ prompt_embed = self.uncond_prompt_embed
65
+ else:
66
+ prompt_embed = torch.load(
67
+ os.path.join(self.prompt_embed_dir, prompt_embed_file),
68
+ map_location="cpu",
69
+ weights_only=True,
70
+ )
71
+ pooled_prompt_embeds = torch.load(
72
+ os.path.join(
73
+ self.pooled_prompt_embeds_dir, pooled_prompt_embeds_file
74
+ ),
75
+ map_location="cpu",
76
+ weights_only=True,
77
+ )
78
+ text_ids = torch.load(
79
+ os.path.join(
80
+ self.text_ids_dir, text_ids_file
81
+ ),
82
+ map_location="cpu",
83
+ weights_only=True,
84
+ )
85
+ latents = torch.load(
86
+ os.path.join(
87
+ self.latents_dir, latent_file
88
+ ),
89
+ map_location="cpu",
90
+ weights_only=True,
91
+ )
92
+ images = torch.load(
93
+ os.path.join(
94
+ self.images_dir, image_file
95
+ ),
96
+ map_location="cpu",
97
+ weights_only=True,
98
+ )
99
+ return prompt_embed, pooled_prompt_embeds, text_ids, self.data_anno[idx]['caption'], latents, images
100
+
101
+ def __len__(self):
102
+ return len(self.data_anno)
103
+
104
+
105
+ def latent_collate_function(batch):
106
+ # return latent, prompt, latent_attn_mask, text_attn_mask
107
+ # latent_attn_mask: # b t h w
108
+ # text_attn_mask: b 1 l
109
+ # needs to check if the latent/prompt' size and apply padding & attn mask
110
+ prompt_embeds, pooled_prompt_embeds, text_ids, caption, latents, images = zip(*batch)
111
+ # attn mask
112
+ prompt_embeds = torch.stack(prompt_embeds, dim=0)
113
+ pooled_prompt_embeds = torch.stack(pooled_prompt_embeds, dim=0)
114
+ text_ids = torch.stack(text_ids, dim=0)
115
+ latents= torch.stack(latents, dim=0)
116
+ images= torch.stack(images, dim=0)
117
+ #latents = torch.stack(latents, dim=0)
118
+ return prompt_embeds, pooled_prompt_embeds, text_ids, caption, latents, images
119
+
120
+
121
+ if __name__ == "__main__":
122
+ dataset = LatentDataset("data/rl_embeddings/videos2caption.json", num_latent_t=28, cfg_rate=0.0)
123
+ dataloader = torch.utils.data.DataLoader(
124
+ dataset, batch_size=2, shuffle=False, collate_fn=latent_collate_function
125
+ )
126
+ for prompt_embed, prompt_attention_mask, caption in dataloader:
127
+ print(
128
+ prompt_embed.shape,
129
+ prompt_attention_mask.shape,
130
+ caption
131
+ )
132
+ import pdb
133
+
134
+ pdb.set_trace()
fastvideo/dataset/latent_flux_rl_datasets.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) [2025] [FastVideo Team]
2
+ # Copyright (c) [2025] [ByteDance Ltd. and/or its affiliates.]
3
+ # SPDX-License-Identifier: [Apache License 2.0]
4
+ #
5
+ # This file has been modified by [ByteDance Ltd. and/or its affiliates.] in 2025.
6
+ #
7
+ # Original file was released under [Apache License 2.0], with the full license text
8
+ # available at [https://github.com/hao-ai-lab/FastVideo/blob/main/LICENSE].
9
+ #
10
+ # This modified file is released under the same license.
11
+
12
+ import torch
13
+ from torch.utils.data import Dataset
14
+ import json
15
+ import os
16
+ import random
17
+
18
+
19
+ class LatentDataset(Dataset):
20
+ def __init__(
21
+ self, json_path, num_latent_t, cfg_rate,
22
+ ):
23
+ # data_merge_path: video_dir, latent_dir, prompt_embed_dir, json_path
24
+ self.json_path = json_path
25
+ self.cfg_rate = cfg_rate
26
+ self.datase_dir_path = os.path.dirname(json_path)
27
+ #self.video_dir = os.path.join(self.datase_dir_path, "video")
28
+ #self.latent_dir = os.path.join(self.datase_dir_path, "latent")
29
+ self.prompt_embed_dir = os.path.join(self.datase_dir_path, "prompt_embed")
30
+ self.pooled_prompt_embeds_dir = os.path.join(
31
+ self.datase_dir_path, "pooled_prompt_embeds"
32
+ )
33
+ self.text_ids_dir = os.path.join(
34
+ self.datase_dir_path, "text_ids"
35
+ )
36
+ with open(self.json_path, "r") as f:
37
+ self.data_anno = json.load(f)
38
+ # json.load(f) already keeps the order
39
+ # self.data_anno = sorted(self.data_anno, key=lambda x: x['latent_path'])
40
+ self.num_latent_t = num_latent_t
41
+ # just zero embeddings [256, 4096]
42
+ self.uncond_prompt_embed = torch.zeros(256, 4096).to(torch.float32)
43
+ # 256 zeros
44
+ self.uncond_prompt_mask = torch.zeros(256).bool()
45
+ self.lengths = [
46
+ data_item["length"] if "length" in data_item else 1
47
+ for data_item in self.data_anno
48
+ ]
49
+
50
+ def __getitem__(self, idx):
51
+ #latent_file = self.data_anno[idx]["latent_path"]
52
+ prompt_embed_file = self.data_anno[idx]["prompt_embed_path"]
53
+ pooled_prompt_embeds_file = self.data_anno[idx]["pooled_prompt_embeds_path"]
54
+ text_ids_file = self.data_anno[idx]["text_ids"]
55
+ if random.random() < self.cfg_rate:
56
+ prompt_embed = self.uncond_prompt_embed
57
+ else:
58
+ prompt_embed = torch.load(
59
+ os.path.join(self.prompt_embed_dir, prompt_embed_file),
60
+ map_location="cpu",
61
+ weights_only=True,
62
+ )
63
+ pooled_prompt_embeds = torch.load(
64
+ os.path.join(
65
+ self.pooled_prompt_embeds_dir, pooled_prompt_embeds_file
66
+ ),
67
+ map_location="cpu",
68
+ weights_only=True,
69
+ )
70
+ text_ids = torch.load(
71
+ os.path.join(
72
+ self.text_ids_dir, text_ids_file
73
+ ),
74
+ map_location="cpu",
75
+ weights_only=True,
76
+ )
77
+ return prompt_embed, pooled_prompt_embeds, text_ids, self.data_anno[idx]['caption']
78
+
79
+ def __len__(self):
80
+ return len(self.data_anno)
81
+
82
+
83
+ def latent_collate_function(batch):
84
+ # return latent, prompt, latent_attn_mask, text_attn_mask
85
+ # latent_attn_mask: # b t h w
86
+ # text_attn_mask: b 1 l
87
+ # needs to check if the latent/prompt' size and apply padding & attn mask
88
+ prompt_embeds, pooled_prompt_embeds, text_ids, caption = zip(*batch)
89
+ # attn mask
90
+ prompt_embeds = torch.stack(prompt_embeds, dim=0)
91
+ pooled_prompt_embeds = torch.stack(pooled_prompt_embeds, dim=0)
92
+ text_ids = torch.stack(text_ids, dim=0)
93
+ #latents = torch.stack(latents, dim=0)
94
+ return prompt_embeds, pooled_prompt_embeds, text_ids, caption
95
+
96
+
97
+ if __name__ == "__main__":
98
+ dataset = LatentDataset("data/rl_embeddings/videos2caption.json", num_latent_t=28, cfg_rate=0.0)
99
+ dataloader = torch.utils.data.DataLoader(
100
+ dataset, batch_size=2, shuffle=False, collate_fn=latent_collate_function
101
+ )
102
+ for prompt_embed, prompt_attention_mask, caption in dataloader:
103
+ print(
104
+ prompt_embed.shape,
105
+ prompt_attention_mask.shape,
106
+ caption
107
+ )
108
+ import pdb
109
+
110
+ pdb.set_trace()
fastvideo/dataset/latent_qwenimage_rl_datasets.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) [2025] [FastVideo Team]
2
+ # Copyright (c) [2025] [ByteDance Ltd. and/or its affiliates.]
3
+ # SPDX-License-Identifier: [Apache License 2.0]
4
+ #
5
+ # This file has been modified by [ByteDance Ltd. and/or its affiliates.] in 2025.
6
+ #
7
+ # Original file was released under [Apache License 2.0], with the full license text
8
+ # available at [https://github.com/hao-ai-lab/FastVideo/blob/main/LICENSE].
9
+ #
10
+ # This modified file is released under the same license.
11
+
12
+ import torch
13
+ from torch.utils.data import Dataset
14
+ import json
15
+ import os
16
+ import random
17
+
18
+
19
+ class LatentDataset(Dataset):
20
+ def __init__(
21
+ self, json_path, num_latent_t, cfg_rate,
22
+ ):
23
+ # data_merge_path: video_dir, latent_dir, prompt_embed_dir, json_path
24
+ self.json_path = json_path
25
+ self.cfg_rate = cfg_rate
26
+ self.datase_dir_path = os.path.dirname(json_path)
27
+ #self.video_dir = os.path.join(self.datase_dir_path, "video")
28
+ #self.latent_dir = os.path.join(self.datase_dir_path, "latent")
29
+ self.prompt_embed_dir = os.path.join(self.datase_dir_path, "prompt_embed")
30
+ self.prompt_attention_mask_dir = os.path.join(
31
+ self.datase_dir_path, "prompt_attention_mask"
32
+ )
33
+ with open(self.json_path, "r") as f:
34
+ self.data_anno = json.load(f)
35
+ # json.load(f) already keeps the order
36
+ # self.data_anno = sorted(self.data_anno, key=lambda x: x['latent_path'])
37
+ self.num_latent_t = num_latent_t
38
+ # just zero embeddings [256, 4096]
39
+ self.uncond_prompt_embed = torch.zeros(256, 4096).to(torch.float32)
40
+ # 256 zeros
41
+ self.uncond_prompt_mask = torch.zeros(256).bool()
42
+ self.lengths = [
43
+ data_item["length"] if "length" in data_item else 1
44
+ for data_item in self.data_anno
45
+ ]
46
+
47
+ def __getitem__(self, idx):
48
+ #latent_file = self.data_anno[idx]["latent_path"]
49
+ prompt_embed_file = self.data_anno[idx]["prompt_embed_path"]
50
+ prompt_attention_mask_file = self.data_anno[idx]["prompt_attention_mask"]
51
+ if random.random() < self.cfg_rate:
52
+ prompt_embed = self.uncond_prompt_embed
53
+ prompt_attention_mask = self.uncond_prompt_mask
54
+ else:
55
+ prompt_embed = torch.load(
56
+ os.path.join(self.prompt_embed_dir, prompt_embed_file),
57
+ map_location="cpu",
58
+ weights_only=True,
59
+ )
60
+ prompt_attention_mask = torch.load(
61
+ os.path.join(
62
+ self.prompt_attention_mask_dir, prompt_attention_mask_file
63
+ ),
64
+ map_location="cpu",
65
+ weights_only=True,
66
+ )
67
+ return prompt_embed, prompt_attention_mask, self.data_anno[idx]['caption'], self.data_anno[idx]['original_length']
68
+
69
+ def __len__(self):
70
+ return len(self.data_anno)
71
+
72
+
73
+ def latent_collate_function(batch):
74
+ # return latent, prompt, latent_attn_mask, text_attn_mask
75
+ # latent_attn_mask: # b t h w
76
+ # text_attn_mask: b 1 l
77
+ # needs to check if the latent/prompt' size and apply padding & attn mask
78
+ prompt_embeds, prompt_attention_masks, caption, original_length = zip(*batch)
79
+ # attn mask
80
+ prompt_embeds = torch.stack(prompt_embeds, dim=0)
81
+ prompt_attention_masks = torch.stack(prompt_attention_masks, dim=0)
82
+
83
+ # Convert original_length to tensor
84
+ original_length = torch.tensor(original_length, dtype=torch.long)
85
+
86
+ # Convert caption to list
87
+ caption = list(caption)
88
+
89
+ #latents = torch.stack(latents, dim=0)
90
+ return prompt_embeds, prompt_attention_masks, caption, original_length
fastvideo/dataset/latent_rl_datasets.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) [2025] [FastVideo Team]
2
+ # Copyright (c) [2025] [ByteDance Ltd. and/or its affiliates.]
3
+ # SPDX-License-Identifier: [Apache License 2.0]
4
+ #
5
+ # This file has been modified by [ByteDance Ltd. and/or its affiliates.] in 2025.
6
+ #
7
+ # Original file was released under [Apache License 2.0], with the full license text
8
+ # available at [https://github.com/hao-ai-lab/FastVideo/blob/main/LICENSE].
9
+ #
10
+ # This modified file is released under the same license.
11
+
12
+ import torch
13
+ from torch.utils.data import Dataset
14
+ import json
15
+ import os
16
+ import random
17
+
18
+
19
+ class LatentDataset(Dataset):
20
+ def __init__(
21
+ self, json_path, num_latent_t, cfg_rate,
22
+ ):
23
+ # data_merge_path: video_dir, latent_dir, prompt_embed_dir, json_path
24
+ self.json_path = json_path
25
+ self.cfg_rate = cfg_rate
26
+ self.datase_dir_path = os.path.dirname(json_path)
27
+ #self.video_dir = os.path.join(self.datase_dir_path, "video")
28
+ #self.latent_dir = os.path.join(self.datase_dir_path, "latent")
29
+ self.prompt_embed_dir = os.path.join(self.datase_dir_path, "prompt_embed")
30
+ self.prompt_attention_mask_dir = os.path.join(
31
+ self.datase_dir_path, "prompt_attention_mask"
32
+ )
33
+ with open(self.json_path, "r") as f:
34
+ self.data_anno = json.load(f)
35
+ # json.load(f) already keeps the order
36
+ # self.data_anno = sorted(self.data_anno, key=lambda x: x['latent_path'])
37
+ self.num_latent_t = num_latent_t
38
+ # just zero embeddings [256, 4096]
39
+ self.uncond_prompt_embed = torch.zeros(256, 4096).to(torch.float32)
40
+ # 256 zeros
41
+ self.uncond_prompt_mask = torch.zeros(256).bool()
42
+ self.lengths = [
43
+ data_item["length"] if "length" in data_item else 1
44
+ for data_item in self.data_anno
45
+ ]
46
+
47
+ def __getitem__(self, idx):
48
+ #latent_file = self.data_anno[idx]["latent_path"]
49
+ prompt_embed_file = self.data_anno[idx]["prompt_embed_path"]
50
+ prompt_attention_mask_file = self.data_anno[idx]["prompt_attention_mask"]
51
+ if random.random() < self.cfg_rate:
52
+ prompt_embed = self.uncond_prompt_embed
53
+ prompt_attention_mask = self.uncond_prompt_mask
54
+ else:
55
+ prompt_embed = torch.load(
56
+ os.path.join(self.prompt_embed_dir, prompt_embed_file),
57
+ map_location="cpu",
58
+ weights_only=True,
59
+ )
60
+ prompt_attention_mask = torch.load(
61
+ os.path.join(
62
+ self.prompt_attention_mask_dir, prompt_attention_mask_file
63
+ ),
64
+ map_location="cpu",
65
+ weights_only=True,
66
+ )
67
+ return prompt_embed, prompt_attention_mask, self.data_anno[idx]['caption']
68
+
69
+ def __len__(self):
70
+ return len(self.data_anno)
71
+
72
+
73
+ def latent_collate_function(batch):
74
+ # return latent, prompt, latent_attn_mask, text_attn_mask
75
+ # latent_attn_mask: # b t h w
76
+ # text_attn_mask: b 1 l
77
+ # needs to check if the latent/prompt' size and apply padding & attn mask
78
+ prompt_embeds, prompt_attention_masks, caption = zip(*batch)
79
+ # attn mask
80
+ prompt_embeds = torch.stack(prompt_embeds, dim=0)
81
+ prompt_attention_masks = torch.stack(prompt_attention_masks, dim=0)
82
+ #latents = torch.stack(latents, dim=0)
83
+ return prompt_embeds, prompt_attention_masks, caption
84
+
85
+
86
+ if __name__ == "__main__":
87
+ dataset = LatentDataset("data/rl_embeddings/videos2caption.json", num_latent_t=28, cfg_rate=0.0)
88
+ dataloader = torch.utils.data.DataLoader(
89
+ dataset, batch_size=2, shuffle=False, collate_fn=latent_collate_function
90
+ )
91
+ for prompt_embed, prompt_attention_mask, caption in dataloader:
92
+ print(
93
+ prompt_embed.shape,
94
+ prompt_attention_mask.shape,
95
+ caption
96
+ )
97
+ import pdb
98
+
99
+ pdb.set_trace()
fastvideo/dataset/t2v_datasets.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #This code file is from [https://github.com/hao-ai-lab/FastVideo], which is licensed under Apache License 2.0.
2
+
3
+ import json
4
+ import math
5
+ import os
6
+ import random
7
+ from collections import Counter
8
+ from os.path import join as opj
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torchvision
13
+ from einops import rearrange
14
+ from PIL import Image
15
+ from torch.utils.data import Dataset
16
+
17
+ from fastvideo.utils.dataset_utils import DecordInit
18
+ from fastvideo.utils.logging_ import main_print
19
+
20
+
21
class SingletonMeta(type):
    """Metaclass caching exactly one shared instance per class.

    The first instantiation constructs the object; subsequent calls return
    the cached instance (``__init__`` runs only once).
    """

    _instances = {}

    def __call__(cls, *args, **kwargs):
        try:
            return cls._instances[cls]
        except KeyError:
            cls._instances[cls] = super().__call__(*args, **kwargs)
            return cls._instances[cls]
29
+
30
+
31
class DataSetProg(metaclass=SingletonMeta):
    """Singleton that shards a shuffled sample-index list across workers.

    ``set_cap_list`` shuffles all indices once and splits them into
    contiguous per-worker buckets; ``get_item`` hands each worker the next
    index from its own bucket, wrapping around when exhausted.
    """

    def __init__(self):
        self.cap_list = []
        self.elements = []
        self.num_workers = 1
        self.n_elements = 0
        self.worker_elements = dict()
        self.n_used_elements = dict()

    def set_cap_list(self, num_workers, cap_list, n_elements):
        """Shuffle ``n_elements`` indices and partition them per worker."""
        self.num_workers = num_workers
        self.cap_list = cap_list
        self.n_elements = n_elements
        self.elements = list(range(n_elements))
        random.shuffle(self.elements)
        print(f"n_elements: {len(self.elements)}", flush=True)

        per_worker = int(
            math.ceil(len(self.elements) / float(self.num_workers)))
        for worker_id in range(self.num_workers):
            self.n_used_elements[worker_id] = 0
            start = worker_id * per_worker
            stop = min(start + per_worker, len(self.elements))
            self.worker_elements[worker_id] = self.elements[start:stop]

    def get_item(self, work_info):
        """Return the next shuffled dataset index for the calling worker.

        ``work_info`` is ``torch.utils.data.get_worker_info()`` (or None in
        the main process, which maps to worker 0).
        """
        worker_id = 0 if work_info is None else work_info.id
        bucket = self.worker_elements[worker_id]
        idx = bucket[self.n_used_elements[worker_id] % len(bucket)]
        self.n_used_elements[worker_id] += 1
        return idx


dataset_prog = DataSetProg()
71
+
72
+
73
def filter_resolution(h,
                      w,
                      max_h_div_w_ratio=17 / 16,
                      min_h_div_w_ratio=8 / 16):
    """Return True when the aspect ratio h/w lies within [min, max] (inclusive)."""
    ratio = h / w
    return min_h_div_w_ratio <= ratio <= max_h_div_w_ratio
80
+
81
+
82
class T2V_dataset(Dataset):
    """Joint text-video / text-image dataset.

    Reads a merge file listing ``folder,annotation.json`` pairs, filters
    entries by caption/resolution/duration, precomputes the frame indices to
    sample for each video, and returns pixel tensors plus tokenized captions.
    Caption dropout (probability ``args.cfg``) is applied for classifier-free
    guidance training. Sample sharding across dataloader workers is delegated
    to the module-level ``dataset_prog`` singleton.
    """

    def __init__(self, args, transform, temporal_sample, tokenizer,
                 transform_topcrop):
        # NOTE(review): ``args`` must expose every attribute read below
        # (data_merge_path, num_frames, train_fps, ...) — confirm with caller.
        self.data = args.data_merge_path
        self.num_frames = args.num_frames
        self.train_fps = args.train_fps
        self.use_image_num = args.use_image_num
        self.transform = transform
        self.transform_topcrop = transform_topcrop
        self.temporal_sample = temporal_sample
        self.tokenizer = tokenizer
        self.text_max_length = args.text_max_length
        self.cfg = args.cfg  # caption-dropout probability for CFG
        self.speed_factor = args.speed_factor
        self.max_height = args.max_height
        self.max_width = args.max_width
        self.drop_short_ratio = args.drop_short_ratio
        assert self.speed_factor >= 1
        self.v_decoder = DecordInit()
        self.video_length_tolerance_range = args.video_length_tolerance_range
        # Only the mT5 text encoder is treated as Chinese-capable.
        self.support_Chinese = True
        if "mt5" not in args.text_encoder_name:
            self.support_Chinese = False

        cap_list = self.get_cap_list()

        assert len(cap_list) > 0
        cap_list, self.sample_num_frames = self.define_frame_index(cap_list)
        self.lengths = self.sample_num_frames

        n_elements = len(cap_list)
        # Shuffle and shard sample indices across dataloader workers.
        dataset_prog.set_cap_list(args.dataloader_num_workers, cap_list,
                                  n_elements)

        print(f"video length: {len(dataset_prog.cap_list)}", flush=True)

    def set_checkpoint(self, n_used_elements):
        # Restore per-worker consumption counters when resuming training.
        for i in range(len(dataset_prog.n_used_elements)):
            dataset_prog.n_used_elements[i] = n_used_elements

    def __len__(self):
        return dataset_prog.n_elements

    def __getitem__(self, idx):

        data = self.get_data(idx)
        return data

    def get_data(self, idx):
        # Dispatch on file extension: .mp4 -> video sample, anything else image.
        path = dataset_prog.cap_list[idx]["path"]
        if path.endswith(".mp4"):
            return self.get_video(idx)
        else:
            return self.get_image(idx)

    def get_video(self, idx):
        """Load, subsample, transform and tokenize one video sample."""
        video_path = dataset_prog.cap_list[idx]["path"]
        assert os.path.exists(video_path), f"file {video_path} do not exist!"
        # Frame indices were precomputed by define_frame_index().
        frame_indices = dataset_prog.cap_list[idx]["sample_frame_index"]
        torchvision_video, _, metadata = torchvision.io.read_video(
            video_path, output_format="TCHW")
        video = torchvision_video[frame_indices]
        video = self.transform(video)
        video = rearrange(video, "t c h w -> c t h w")
        video = video.to(torch.uint8)
        assert video.dtype == torch.uint8

        h, w = video.shape[-2:]
        assert (
            h / w <= 17 / 16 and h / w >= 8 / 16
        ), f"Only videos with a ratio (h/w) less than 17/16 and more than 8/16 are supported. But video ({video_path}) found ratio is {round(h / w, 2)} with the shape of {video.shape}"

        # uint8 [0, 255] -> float [-1, 1]
        video = video.float() / 127.5 - 1.0

        text = dataset_prog.cap_list[idx]["cap"]
        if not isinstance(text, list):
            text = [text]
        text = [random.choice(text)]

        # Classifier-free guidance: drop the caption with probability self.cfg.
        text = text[0] if random.random() > self.cfg else ""
        text_tokens_and_mask = self.tokenizer(
            text,
            max_length=self.text_max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        input_ids = text_tokens_and_mask["input_ids"]
        cond_mask = text_tokens_and_mask["attention_mask"]
        return dict(
            pixel_values=video,
            text=text,
            input_ids=input_ids,
            cond_mask=cond_mask,
            path=video_path,
        )

    def get_image(self, idx):
        """Load, transform and tokenize one image sample (as a 1-frame clip)."""
        image_data = dataset_prog.cap_list[
            idx]  # [{'path': path, 'cap': cap}, ...]

        image = Image.open(image_data["path"]).convert("RGB")  # [h, w, c]
        image = torch.from_numpy(np.array(image))  # [h, w, c]
        image = rearrange(image, "h w c -> c h w").unsqueeze(0)  # [1 c h w]
        # for i in image:
        #     h, w = i.shape[-2:]
        #     assert h / w <= 17 / 16 and h / w >= 8 / 16, f'Only image with a ratio (h/w) less than 17/16 and more than 8/16 are supported. But found ratio is {round(h / w, 2)} with the shape of {i.shape}'

        # "human_images" samples use the top-crop transform, others the default.
        image = (self.transform_topcrop(image) if "human_images"
                 in image_data["path"] else self.transform(image)
                 )  # [1 C H W] -> num_img [1 C H W]
        image = image.transpose(0, 1)  # [1 C H W] -> [C 1 H W]

        # uint8 [0, 255] -> float [-1, 1]
        image = image.float() / 127.5 - 1.0

        caps = (image_data["cap"] if isinstance(image_data["cap"], list) else
                [image_data["cap"]])
        caps = [random.choice(caps)]
        text = caps
        input_ids, cond_mask = [], []
        # Classifier-free guidance caption dropout.
        text = text[0] if random.random() > self.cfg else ""
        text_tokens_and_mask = self.tokenizer(
            text,
            max_length=self.text_max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        input_ids = text_tokens_and_mask["input_ids"]  # 1, l
        cond_mask = text_tokens_and_mask["attention_mask"]  # 1, l
        return dict(
            pixel_values=image,
            text=text,
            input_ids=input_ids,
            cond_mask=cond_mask,
            path=image_data["path"],
        )

    def define_frame_index(self, cap_list):
        """Filter annotations and precompute per-sample frame indices.

        Drops entries without caption/fps/duration/resolution, entries whose
        aspect ratio falls outside the allowed band, over-long videos, and
        (probabilistically) too-short ones. Returns the filtered list plus
        the per-sample frame counts, and prints filter statistics.
        """
        new_cap_list = []
        sample_num_frames = []
        cnt_too_long = 0
        cnt_too_short = 0
        cnt_no_cap = 0
        cnt_no_resolution = 0
        cnt_resolution_mismatch = 0
        cnt_movie = 0
        cnt_img = 0
        for i in cap_list:
            path = i["path"]
            cap = i.get("cap", None)
            # ======no caption=====
            if cap is None:
                cnt_no_cap += 1
                continue
            if path.endswith(".mp4"):
                # ======no fps and duration=====
                duration = i.get("duration", None)
                fps = i.get("fps", None)
                if fps is None or duration is None:
                    continue

                # ======resolution mismatch=====
                resolution = i.get("resolution", None)
                if resolution is None:
                    cnt_no_resolution += 1
                    continue
                else:
                    if (resolution.get("height", None) is None
                            or resolution.get("width", None) is None):
                        cnt_no_resolution += 1
                        continue
                    height, width = i["resolution"]["height"], i["resolution"][
                        "width"]
                    aspect = self.max_height / self.max_width
                    hw_aspect_thr = 1.5
                    is_pick = filter_resolution(
                        height,
                        width,
                        max_h_div_w_ratio=hw_aspect_thr * aspect,
                        min_h_div_w_ratio=1 / hw_aspect_thr * aspect,
                    )
                    if not is_pick:
                        print("resolution mismatch")
                        cnt_resolution_mismatch += 1
                        continue

                # import ipdb;ipdb.set_trace()
                i["num_frames"] = math.ceil(fps * duration)
                # max 5.0 and min 1.0 are just thresholds to filter some videos which have suitable duration.
                if i["num_frames"] / fps > self.video_length_tolerance_range * (
                        self.num_frames / self.train_fps * self.speed_factor
                ):  # too long video is not suitable for this training stage (self.num_frames)
                    cnt_too_long += 1
                    continue

                # resample in case high fps, such as 50/60/90/144 -> train_fps(e.g, 24)
                frame_interval = fps / self.train_fps
                start_frame_idx = 0
                frame_indices = np.arange(start_frame_idx, i["num_frames"],
                                          frame_interval).astype(int)

                # comment out it to enable dynamic frames training
                if (len(frame_indices) < self.num_frames
                        and random.random() < self.drop_short_ratio):
                    cnt_too_short += 1
                    continue

                # too long video will be temporal-crop randomly
                if len(frame_indices) > self.num_frames:
                    begin_index, end_index = self.temporal_sample(
                        len(frame_indices))
                    frame_indices = frame_indices[begin_index:end_index]
                    # frame_indices = frame_indices[:self.num_frames] # head crop
                i["sample_frame_index"] = frame_indices.tolist()
                new_cap_list.append(i)
                i["sample_num_frames"] = len(
                    i["sample_frame_index"]
                )  # will use in dataloader(group sampler)
                sample_num_frames.append(i["sample_num_frames"])
            elif path.endswith(".jpg"):  # image
                cnt_img += 1
                new_cap_list.append(i)
                i["sample_num_frames"] = 1
                sample_num_frames.append(i["sample_num_frames"])
            else:
                raise NameError(
                    f"Unknown file extension {path.split('.')[-1]}, only support .mp4 for video and .jpg for image"
                )
        # import ipdb;ipdb.set_trace()
        main_print(
            f"no_cap: {cnt_no_cap}, too_long: {cnt_too_long}, too_short: {cnt_too_short}, "
            f"no_resolution: {cnt_no_resolution}, resolution_mismatch: {cnt_resolution_mismatch}, "
            f"Counter(sample_num_frames): {Counter(sample_num_frames)}, cnt_movie: {cnt_movie}, cnt_img: {cnt_img}, "
            f"before filter: {len(cap_list)}, after filter: {len(new_cap_list)}"
        )
        return new_cap_list, sample_num_frames

    def decord_read(self, path, frame_indices):
        """Decode the given frame indices from ``path`` via decord."""
        decord_vr = self.v_decoder(path)
        video_data = decord_vr.get_batch(frame_indices).asnumpy()
        video_data = torch.from_numpy(video_data)
        video_data = video_data.permute(0, 3, 1,
                                        2)  # (T, H, W, C) -> (T C H W)
        return video_data

    def read_jsons(self, data):
        """Parse the merge file: each non-empty line is "folder,annotation.json".

        Every annotation entry's relative path is prefixed with its folder.
        """
        cap_lists = []
        with open(data, "r") as f:
            folder_anno = [
                i.strip().split(",") for i in f.readlines()
                if len(i.strip()) > 0
            ]
        print(folder_anno)
        for folder, anno in folder_anno:
            with open(anno, "r") as f:
                sub_list = json.load(f)
            for i in range(len(sub_list)):
                sub_list[i]["path"] = opj(folder, sub_list[i]["path"])
            cap_lists += sub_list
        return cap_lists

    def get_cap_list(self):
        cap_lists = self.read_jsons(self.data)
        return cap_lists
fastvideo/dataset/transform.py ADDED
@@ -0,0 +1,647 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #This code file is from [https://github.com/hao-ai-lab/FastVideo], which is licensed under Apache License 2.0.
2
+
3
+ import numbers
4
+ import random
5
+
6
+ import torch
7
+ from PIL import Image
8
+
9
+
10
def _is_tensor_video_clip(clip):
    """Validate that ``clip`` is a 4-D torch tensor; raise otherwise."""
    if not torch.is_tensor(clip):
        raise TypeError("clip should be Tensor. Got %s" % type(clip))
    if clip.ndimension() != 4:
        raise ValueError("clip should be 4D. Got %dD" % clip.dim())
    return True
18
+
19
+
20
def center_crop_arr(pil_image, image_size):
    """
    Center cropping implementation from ADM.
    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126

    Repeatedly halves the image while it is at least twice the target size,
    rescales so the short side equals ``image_size``, then center-crops an
    ``image_size`` x ``image_size`` square. Returns a PIL Image.
    """
    # Bug fix: numpy is not imported at this file's module level (only inside
    # the __main__ block), so the original body raised NameError on np.array.
    import numpy as np

    while min(*pil_image.size) >= 2 * image_size:
        pil_image = pil_image.resize(tuple(x // 2 for x in pil_image.size),
                                     resample=Image.BOX)

    scale = image_size / min(*pil_image.size)
    pil_image = pil_image.resize(tuple(
        round(x * scale) for x in pil_image.size),
                                 resample=Image.BICUBIC)

    arr = np.array(pil_image)
    crop_y = (arr.shape[0] - image_size) // 2
    crop_x = (arr.shape[1] - image_size) // 2
    return Image.fromarray(arr[crop_y:crop_y + image_size,
                               crop_x:crop_x + image_size])
39
+
40
+
41
def crop(clip, i, j, h, w):
    """
    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)

    Returns the (h, w) spatial window whose top-left corner is at (i, j).
    """
    if clip.dim() != 4:
        raise ValueError("clip should be a 4D tensor")
    return clip[..., i:i + h, j:j + w]
49
+
50
+
51
def resize(clip, target_size, interpolation_mode):
    """Resize ``clip`` so its spatial size equals ``target_size`` (height, width)."""
    if len(target_size) != 2:
        raise ValueError(
            f"target size should be tuple (height, width), instead got {target_size}"
        )
    return torch.nn.functional.interpolate(clip,
                                           size=target_size,
                                           mode=interpolation_mode,
                                           align_corners=True,
                                           antialias=True)
63
+
64
+
65
def resize_scale(clip, target_size, interpolation_mode):
    """Uniformly scale ``clip`` so its short side matches ``target_size[0]``."""
    if len(target_size) != 2:
        raise ValueError(
            f"target size should be tuple (height, width), instead got {target_size}"
        )
    short_side = min(clip.size(-2), clip.size(-1))
    return torch.nn.functional.interpolate(clip,
                                           scale_factor=target_size[0] /
                                           short_side,
                                           mode=interpolation_mode,
                                           align_corners=True,
                                           antialias=True)
79
+
80
+
81
def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
    """
    Do spatial cropping and resizing to the video clip
    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        i (int): i in (i,j) i.e coordinates of the upper left corner.
        j (int): j in (i,j) i.e coordinates of the upper left corner.
        h (int): Height of the cropped region.
        w (int): Width of the cropped region.
        size (tuple(int, int)): height and width of resized clip
    Returns:
        clip (torch.tensor): Resized and cropped clip. Size is (T, C, H, W)
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    return resize(crop(clip, i, j, h, w), size, interpolation_mode)
99
+
100
+
101
def center_crop(clip, crop_size):
    """Center-crop ``clip`` to spatial size ``crop_size`` == (th, tw)."""
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    h, w = clip.size(-2), clip.size(-1)
    th, tw = crop_size
    if h < th or w < tw:
        raise ValueError("height and width must be no smaller than crop_size")
    top = int(round((h - th) / 2.0))
    left = int(round((w - tw) / 2.0))
    return crop(clip, top, left, th, tw)
112
+
113
+
114
def center_crop_using_short_edge(clip):
    """Center-crop ``clip`` to a square whose side is the short spatial edge."""
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    h, w = clip.size(-2), clip.size(-1)
    side = min(h, w)
    top = 0 if h < w else int(round((h - side) / 2.0))
    left = int(round((w - side) / 2.0)) if h < w else 0
    return crop(clip, top, left, side, side)
127
+
128
+
129
def center_crop_th_tw(clip, th, tw, top_crop):
    """Crop the largest window with aspect ratio th/tw, centered (or top-aligned
    vertically when ``top_crop`` is true)."""
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")

    h, w = clip.size(-2), clip.size(-1)
    target_ratio = th / tw
    if h / w > target_ratio:
        # Too tall: keep full width, shrink height to match the ratio.
        new_h, new_w = int(w * target_ratio), w
    else:
        # Too wide (or exact): keep full height, shrink width.
        new_h, new_w = h, int(h / target_ratio)

    top = 0 if top_crop else int(round((h - new_h) / 2.0))
    left = int(round((w - new_w) / 2.0))
    return crop(clip, top, left, new_h, new_w)
146
+
147
+
148
def random_shift_crop(clip):
    """
    Slide along the long edge, with the short edge as crop size
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    h, w = clip.size(-2), clip.size(-1)
    side = min(h, w)
    top = torch.randint(0, h - side + 1, size=(1, )).item()
    left = torch.randint(0, w - side + 1, size=(1, )).item()
    return crop(clip, top, left, side, side)
166
+
167
+
168
def normalize_video(clip):
    """
    Convert a uint8 (T, C, H, W) clip to float values in [0, 1].

    Raises TypeError when the input dtype is not uint8.
    """
    _is_tensor_video_clip(clip)
    if clip.dtype != torch.uint8:
        raise TypeError("clip tensor should have data type uint8. Got %s" %
                        str(clip.dtype))
    return clip.float() / 255.0
183
+
184
+
185
def normalize(clip, mean, std, inplace=False):
    """
    Channel-wise normalization: (clip - mean) / std.

    Args:
        clip (torch.tensor): channel-first video clip, 4-D.
        mean (tuple): pixel RGB mean. Size is (3)
        std (tuple): pixel standard deviation. Size is (3)
    Returns:
        normalized clip (torch.tensor), a copy unless ``inplace`` is true.
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    target = clip if inplace else clip.clone()
    mean_t = torch.as_tensor(mean, dtype=target.dtype, device=target.device)
    std_t = torch.as_tensor(std, dtype=target.dtype, device=target.device)
    target.sub_(mean_t[:, None, None, None]).div_(std_t[:, None, None, None])
    return target
203
+
204
+
205
def hflip(clip):
    """
    Flip ``clip`` along its last (width) axis.

    Args:
        clip (torch.tensor): Size is (T, C, H, W)
    Returns:
        flipped clip (torch.tensor): Size is (T, C, H, W)
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    return torch.flip(clip, dims=(-1, ))
215
+
216
+
217
class RandomCropVideo:
    """Randomly crop a (T, C, H, W) clip to a fixed spatial size."""

    def __init__(self, size):
        # An int means a square crop.
        self.size = (int(size),
                     int(size)) if isinstance(size, numbers.Number) else size

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: randomly cropped video clip.
                size is (T, C, OH, OW)
        """
        top, left, h, w = self.get_params(clip)
        return crop(clip, top, left, h, w)

    def get_params(self, clip):
        """Pick a uniformly random valid top-left corner for the crop."""
        h, w = clip.shape[-2:]
        th, tw = self.size

        if h < th or w < tw:
            raise ValueError(
                f"Required crop size {(th, tw)} is larger than input image size {(h, w)}"
            )

        if (h, w) == (th, tw):
            return 0, 0, h, w

        top = torch.randint(0, h - th + 1, size=(1, )).item()
        left = torch.randint(0, w - tw + 1, size=(1, )).item()
        return top, left, th, tw

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size})"
255
+
256
+
257
class SpatialStrideCropVideo:
    """Crop a clip from the top-left so H and W become multiples of ``stride``."""

    def __init__(self, stride):
        self.stride = stride

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: cropped video clip by stride.
                size is (T, C, OH, OW)
        """
        i, j, h, w = self.get_params(clip)
        return crop(clip, i, j, h, w)

    def get_params(self, clip):
        h, w = clip.shape[-2:]
        # Round both spatial dims down to the nearest multiple of the stride.
        th, tw = h // self.stride * self.stride, w // self.stride * self.stride
        return 0, 0, th, tw  # from top-left

    def __repr__(self) -> str:
        # Bug fix: the original referenced self.size, which this class never
        # defines, so repr() raised AttributeError. Report the stride instead.
        return f"{self.__class__.__name__}(stride={self.stride})"
282
+
283
+
284
class LongSideResizeVideo:
    """
    First use the long side,
    then resize to the specified size
    """

    def __init__(
        self,
        size,
        skip_low_resolution=False,
        interpolation_mode="bilinear",
    ):
        self.size = size
        self.skip_low_resolution = skip_low_resolution
        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """Scale ``clip`` so its long spatial side equals ``self.size``.

        Clips already at or below the target are returned unchanged when
        ``skip_low_resolution`` is set.
        """
        _, _, h, w = clip.shape
        if self.skip_low_resolution and max(h, w) <= self.size:
            return clip
        if h > w:
            h, w = self.size, int(w * self.size / h)
        else:
            h, w = int(h * self.size / w), self.size
        return resize(clip,
                      target_size=(h, w),
                      interpolation_mode=self.interpolation_mode)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
324
+
325
+
326
class CenterCropResizeVideo:
    """
    First use the short side for cropping length,
    center crop video, then resize to the specified size
    """

    def __init__(
        self,
        size,
        top_crop=False,
        interpolation_mode="bilinear",
    ):
        if len(size) != 2:
            raise ValueError(
                f"size should be tuple (height, width), instead got {size}")
        self.size = size
        self.top_crop = top_crop
        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """Crop to the target aspect ratio (centered or top-aligned), then resize."""
        cropped = center_crop_th_tw(clip,
                                    self.size[0],
                                    self.size[1],
                                    top_crop=self.top_crop)
        return resize(cropped,
                      target_size=self.size,
                      interpolation_mode=self.interpolation_mode)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
368
+
369
+
370
class UCFCenterCropVideo:
    """
    First scale to the specified size in equal proportion to the short edge,
    then center cropping
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(
                    f"size should be tuple (height, width), instead got {size}"
                )
            self.size = size
        else:
            # An int means a square output.
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """Short-edge scale, then center-crop to ``self.size``."""
        scaled = resize_scale(clip=clip,
                              target_size=self.size,
                              interpolation_mode=self.interpolation_mode)
        return center_crop(scaled, self.size)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
408
+
409
+
410
class KineticsRandomCropResizeVideo:
    """
    Slide along the long edge, with the short edge as crop size. And resize to the desired size.
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(
                    f"size should be tuple (height, width), instead got {size}"
                )
            self.size = size
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """Random short-edge square crop followed by a resize."""
        return resize(random_shift_crop(clip), self.size,
                      self.interpolation_mode)
437
+
438
+ class CenterCropVideo:
439
+
440
+ def __init__(
441
+ self,
442
+ size,
443
+ interpolation_mode="bilinear",
444
+ ):
445
+ if isinstance(size, tuple):
446
+ if len(size) != 2:
447
+ raise ValueError(
448
+ f"size should be tuple (height, width), instead got {size}"
449
+ )
450
+ self.size = size
451
+ else:
452
+ self.size = (size, size)
453
+
454
+ self.interpolation_mode = interpolation_mode
455
+
456
+ def __call__(self, clip):
457
+ """
458
+ Args:
459
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
460
+ Returns:
461
+ torch.tensor: center cropped video clip.
462
+ size is (T, C, crop_size, crop_size)
463
+ """
464
+ clip_center_crop = center_crop(clip, self.size)
465
+ return clip_center_crop
466
+
467
+ def __repr__(self) -> str:
468
+ return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
469
+
470
+
471
class Normalize:
    """
    Normalize the video clip by mean subtraction and division by standard deviation
    Args:
        mean (3-tuple): pixel RGB mean
        std (3-tuple): pixel RGB standard deviation
        inplace (boolean): whether do in-place normalization
    """

    def __init__(self, mean, std, inplace=False):
        self.mean = mean
        self.std = std
        self.inplace = inplace

    def __call__(self, clip):
        """Apply channel-wise (clip - mean) / std to a channel-first clip."""
        return normalize(clip, self.mean, self.std, self.inplace)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})"
494
+
495
+
496
class Normalize255:
    """Convert a uint8 clip to float and scale its values into [0, 1]."""

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
        Return:
            clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
        """
        return normalize_video(clip)

    def __repr__(self) -> str:
        return self.__class__.__name__
515
+
516
+
517
class RandomHorizontalFlipVideo:
    """
    Flip the video clip along the horizontal direction with a given probability
    Args:
        p (float): probability of the clip being flipped. Default value is 0.5
    """

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, clip):
        """Return ``clip`` flipped horizontally with probability ``self.p``."""
        return hflip(clip) if random.random() < self.p else clip

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(p={self.p})"
540
+
541
+
542
+ # ------------------------------------------------------------
543
+ # --------------------- Sampling ---------------------------
544
+ # ------------------------------------------------------------
545
class TemporalRandomCrop(object):
    """Temporally crop the given frame indices at a random location.

    Args:
        size (int): Desired length of frames will be seen in the model.
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, total_frames):
        """Return a random (begin, end) window of at most ``self.size`` frames."""
        latest_start = max(0, total_frames - self.size - 1)
        begin_index = random.randint(0, latest_start)
        return begin_index, min(begin_index + self.size, total_frames)
560
+
561
+
562
class DynamicSampleDuration(object):
    """Randomly pick a truncated temporal length for dynamic-duration training.

    Keeps at least half of the frames, choosing among lengths sampled at
    ``t_stride`` steps; ``extra_1`` accounts for the leading extra frame.
    """

    def __init__(self, t_stride, extra_1):
        self.t_stride = t_stride
        self.extra_1 = extra_1

    def __call__(self, t, h, w):
        base_t = t - 1 if self.extra_1 else t
        # need half at least
        candidates = list(range(base_t + 1))[base_t // 2:][::self.t_stride]
        truncate_t = random.choice(candidates)
        if self.extra_1:
            truncate_t += 1
        return 0, truncate_t
582
+
583
+
584
if __name__ == "__main__":
    # Ad-hoc smoke test: read a sample video, run it through the transform
    # pipeline, then write the result to disk for visual inspection.
    import os

    import numpy as np
    import torchvision.io as io
    from torchvision import transforms
    from torchvision.utils import save_image

    vframes, aframes, info = io.read_video(filename="./v_Archery_g01_c03.avi",
                                           pts_unit="sec",
                                           output_format="TCHW")

    trans = transforms.Compose([
        Normalize255(),
        RandomHorizontalFlipVideo(),
        UCFCenterCropVideo(512),
        # NormalizeVideo(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        transforms.Normalize(mean=[0.5, 0.5, 0.5],
                             std=[0.5, 0.5, 0.5],
                             inplace=True),
    ])

    target_video_len = 32
    frame_interval = 1
    total_frames = len(vframes)
    print(total_frames)

    temporal_sample = TemporalRandomCrop(target_video_len * frame_interval)

    # Sampling video frames
    start_frame_ind, end_frame_ind = temporal_sample(total_frames)
    # print(start_frame_ind)
    # print(end_frame_ind)
    assert end_frame_ind - start_frame_ind >= target_video_len
    # Evenly spaced indices inside the sampled window.
    frame_indice = np.linspace(start_frame_ind,
                               end_frame_ind - 1,
                               target_video_len,
                               dtype=int)
    print(frame_indice)

    select_vframes = vframes[frame_indice]
    print(select_vframes.shape)
    print(select_vframes.dtype)

    select_vframes_trans = trans(select_vframes)
    print(select_vframes_trans.shape)
    print(select_vframes_trans.dtype)

    # Map normalized [-1, 1] floats back to uint8 [0, 255] for writing.
    select_vframes_trans_int = ((select_vframes_trans * 0.5 + 0.5) *
                                255).to(dtype=torch.uint8)
    print(select_vframes_trans_int.dtype)
    print(select_vframes_trans_int.permute(0, 2, 3, 1).shape)

    io.write_video("./test.avi",
                   select_vframes_trans_int.permute(0, 2, 3, 1),
                   fps=8)

    for i in range(target_video_len):
        save_image(
            select_vframes_trans[i],
            os.path.join("./test000", "%04d.png" % i),
            normalize=True,
            value_range=(-1, 1),
        )
fastvideo/distill/__init__.py ADDED
File without changes
fastvideo/distill/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (225 Bytes). View file
 
fastvideo/distill/__pycache__/solver.cpython-312.pyc ADDED
Binary file (16.1 kB). View file
 
fastvideo/distill/discriminator.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #This code file is from [https://github.com/hao-ai-lab/FastVideo], which is licensed under Apache License 2.0.
2
+
3
+ import torch.nn as nn
4
+ from diffusers.utils import logging
5
+
6
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
7
+
8
+
9
class DiscriminatorHead(nn.Module):
    """1x1-conv head that scores transformer features as real/fake.

    The input is a flattened sequence of per-frame feature tokens; it is
    reshaped back to a fixed 30x53 spatial grid before convolution.
    """

    def __init__(self, input_channel, output_channel=1):
        super().__init__()
        inner_channel = 1024
        # LeakyReLU is used instead of the paper's GELU to save memory.
        self.conv1 = nn.Sequential(
            nn.Conv2d(input_channel, inner_channel, 1, 1, 0),
            nn.GroupNorm(32, inner_channel),
            nn.LeakyReLU(inplace=True),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(inner_channel, inner_channel, 1, 1, 0),
            nn.GroupNorm(32, inner_channel),
            nn.LeakyReLU(inplace=True),
        )
        self.conv_out = nn.Conv2d(inner_channel, output_channel, 1, 1, 0)

    def forward(self, x):
        """Map (b, t*30*53, c) token features to (b*t, out, 30, 53) logits."""
        b, twh, c = x.shape
        t = twh // (30 * 53)  # assumes a 30x53 token grid per frame — TODO confirm
        grid = x.view(-1, 30 * 53, c).permute(0, 2, 1).view(b * t, c, 30, 53)
        grid = self.conv1(grid)
        grid = self.conv2(grid) + grid  # residual connection
        return self.conv_out(grid)
41
+
42
+
43
class Discriminator(nn.Module):
    """Ensemble of `DiscriminatorHead`s, one group per sampled transformer layer.

    One head group is created for every `stride`-th layer out of
    `total_layers`, each group holding `num_h_per_head` heads over features of
    width `adapter_channel_dims[i]`.

    Args:
        stride: Sample every `stride`-th transformer layer.
        num_h_per_head: Number of heads per sampled layer.
        adapter_channel_dims: Feature widths, repeated per sampled layer.
            Defaults to `[3072]`.
        total_layers: Total number of transformer layers being tapped.
    """

    def __init__(
        self,
        stride=8,
        num_h_per_head=1,
        adapter_channel_dims=None,
        total_layers=48,
    ):
        super().__init__()
        # Avoid the mutable-default-argument pitfall; [3072] matches the
        # transformer hidden width used elsewhere in this repo.
        if adapter_channel_dims is None:
            adapter_channel_dims = [3072]
        adapter_channel_dims = adapter_channel_dims * (total_layers // stride)
        self.stride = stride
        self.num_h_per_head = num_h_per_head
        self.head_num = len(adapter_channel_dims)
        self.heads = nn.ModuleList([
            nn.ModuleList([
                DiscriminatorHead(adapter_channel)
                for _ in range(self.num_h_per_head)
            ]) for adapter_channel in adapter_channel_dims
        ])

    def forward(self, features):
        """Score each feature tensor with its head group.

        Args:
            features: One feature tensor per head group (same length as
                `self.heads`).

        Returns:
            Flat list of head outputs, ordered group-by-group.
        """
        assert len(features) == len(self.heads)
        outputs = []
        # NOTE: gradient checkpointing was previously applied per head and
        # intentionally disabled; the heads are cheap enough to run directly.
        for feature, head_group in zip(features, self.heads):
            for head in head_group:
                outputs.append(head(feature))
        return outputs
fastvideo/distill/solver.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #This code file is from [https://github.com/hao-ai-lab/FastVideo], which is licensed under Apache License 2.0.
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
9
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
10
+ from diffusers.utils import BaseOutput, logging
11
+
12
+ from fastvideo.models.mochi_hf.pipeline_mochi import linear_quadratic_schedule
13
+
14
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
15
+
16
+
17
@dataclass
class PCMFMSchedulerOutput(BaseOutput):
    """Output container returned by `PCMFMScheduler.step`."""

    # Denoised sample propagated one scheduler step backwards (x_{t-1});
    # feed this as `sample` into the next `step` call.
    prev_sample: torch.FloatTensor
20
+
21
+
22
def extract_into_tensor(a, t, x_shape):
    """Gather per-batch values from `a` and reshape them for broadcasting.

    Selects `a[t_i]` for every index in `t` and returns the result with shape
    (batch, 1, ..., 1) — as many trailing singleton dims as needed to
    broadcast against a tensor of shape `x_shape`.
    """
    batch = t.shape[0]
    gathered = a.gather(-1, t)
    trailing = (1,) * (len(x_shape) - 1)
    return gathered.reshape(batch, *trailing)
26
+
27
+
28
class PCMFMScheduler(SchedulerMixin, ConfigMixin):
    """Phased-Consistency-Model flow-matching Euler scheduler.

    Builds a flow-matching sigma schedule (either a shifted linear schedule
    or the linear-quadratic schedule from the Mochi pipeline), subsamples it
    down to `pcm_timesteps` discrete Euler timesteps, and performs plain
    Euler updates in `step`.

    Args:
        num_train_timesteps: Length of the full training schedule.
        shift: Timestep shift applied to the sigmas (non linear-quadratic
            branch only).
        pcm_timesteps: Number of subsampled Euler timesteps kept.
        linear_quadratic: Use `linear_quadratic_schedule` instead of the
            shifted linear schedule.
        linear_quadratic_threshold: Threshold forwarded to
            `linear_quadratic_schedule`.
        linear_range: Fraction of `num_train_timesteps` treated as the
            linear part of the linear-quadratic schedule.
    """
    _compatibles = []
    order = 1

    @register_to_config
    def __init__(
        self,
        num_train_timesteps: int = 1000,
        shift: float = 1.0,
        pcm_timesteps: int = 50,
        linear_quadratic=False,
        linear_quadratic_threshold=0.025,
        linear_range=0.5,
    ):
        if linear_quadratic:
            linear_steps = int(num_train_timesteps * linear_range)
            sigmas = linear_quadratic_schedule(num_train_timesteps,
                                               linear_quadratic_threshold,
                                               linear_steps)
            sigmas = torch.tensor(sigmas).to(dtype=torch.float32)
        else:
            # Descending timesteps num_train_timesteps..1, mapped to sigmas in
            # (0, 1] and then time-shifted.
            timesteps = np.linspace(1,
                                    num_train_timesteps,
                                    num_train_timesteps,
                                    dtype=np.float32)[::-1].copy()
            timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32)
            sigmas = timesteps / num_train_timesteps
            sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
        # Keep one sigma every (num_train_timesteps // pcm_timesteps) steps.
        self.euler_timesteps = (np.arange(1, pcm_timesteps + 1) *
                                (num_train_timesteps //
                                 pcm_timesteps)).round().astype(np.int64) - 1
        # Double reversal: index into the ascending schedule, then restore
        # descending order for inference.
        self.sigmas = sigmas.numpy()[::-1][self.euler_timesteps]
        self.sigmas = torch.from_numpy((self.sigmas[::-1].copy()))
        self.timesteps = self.sigmas * num_train_timesteps
        self._step_index = None
        self._begin_index = None
        self.sigmas = self.sigmas.to(
            "cpu")  # to avoid too much CPU/GPU communication
        self.sigma_min = self.sigmas[-1].item()
        self.sigma_max = self.sigmas[0].item()

    @property
    def step_index(self):
        """
        The index counter for current timestep. It will increase 1 after each scheduler step.
        """
        return self._step_index

    @property
    def begin_index(self):
        """
        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
        """
        return self._begin_index

    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
    def set_begin_index(self, begin_index: int = 0):
        """
        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.

        Args:
            begin_index (`int`):
                The begin index for the scheduler.
        """
        self._begin_index = begin_index

    def scale_noise(
        self,
        sample: torch.FloatTensor,
        timestep: Union[float, torch.FloatTensor],
        noise: Optional[torch.FloatTensor] = None,
    ) -> torch.FloatTensor:
        """
        Forward process in flow-matching: interpolate `sample` towards `noise`
        by the sigma of the current timestep.

        Args:
            sample (`torch.FloatTensor`):
                The input sample.
            timestep (`int`, *optional*):
                The current timestep in the diffusion chain.

        Returns:
            `torch.FloatTensor`:
                A scaled input sample.
        """
        if self.step_index is None:
            self._init_step_index(timestep)

        sigma = self.sigmas[self.step_index]
        sample = sigma * noise + (1.0 - sigma) * sample

        return sample

    def _sigma_to_t(self, sigma):
        # Sigma and timestep are proportional in this flow-matching schedule.
        return sigma * self.config.num_train_timesteps

    def set_timesteps(self,
                      num_inference_steps: int,
                      device: Union[str, torch.device] = None):
        """
        Sets the discrete timesteps used for the diffusion chain (to be run before inference).

        Args:
            num_inference_steps (`int`):
                The number of diffusion steps used when generating samples with a pre-trained model.
            device (`str` or `torch.device`, *optional*):
                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        """
        self.num_inference_steps = num_inference_steps
        # Evenly subsample the pcm timesteps for inference.
        inference_indices = np.linspace(0,
                                        self.config.pcm_timesteps,
                                        num=num_inference_steps,
                                        endpoint=False)
        inference_indices = np.floor(inference_indices).astype(np.int64)
        inference_indices = torch.from_numpy(inference_indices).long()

        self.sigmas_ = self.sigmas[inference_indices]
        timesteps = self.sigmas_ * self.config.num_train_timesteps
        self.timesteps = timesteps.to(device=device)
        # Append a terminal sigma of 0 so `step` can always look one index
        # ahead for dt.
        self.sigmas_ = torch.cat(
            [self.sigmas_,
             torch.zeros(1, device=self.sigmas_.device)])
        self._step_index = None
        self._begin_index = None

    def index_for_timestep(self, timestep, schedule_timesteps=None):
        # Map a timestep value back to its position in the schedule.
        if schedule_timesteps is None:
            schedule_timesteps = self.timesteps

        indices = (schedule_timesteps == timestep).nonzero()

        # The sigma index that is taken for the **very** first `step`
        # is always the second index (or the last index if there is only 1)
        # This way we can ensure we don't accidentally skip a sigma in
        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
        pos = 1 if len(indices) > 1 else 0

        return indices[pos].item()

    def _init_step_index(self, timestep):
        # Lazily initialize the step counter, either from the timestep value
        # or from an explicitly set begin index.
        if self.begin_index is None:
            if isinstance(timestep, torch.Tensor):
                timestep = timestep.to(self.timesteps.device)
            self._step_index = self.index_for_timestep(timestep)
        else:
            self._step_index = self._begin_index

    def step(
        self,
        model_output: torch.FloatTensor,
        timestep: Union[float, torch.FloatTensor],
        sample: torch.FloatTensor,
        generator: Optional[torch.Generator] = None,
        return_dict: bool = True,
    ) -> Union[PCMFMSchedulerOutput, Tuple]:
        """
        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
        process from the learned model outputs (most often the predicted noise) with a plain Euler update.

        Args:
            model_output (`torch.FloatTensor`):
                The direct output from learned diffusion model.
            timestep (`float`):
                The current discrete timestep in the diffusion chain.
            sample (`torch.FloatTensor`):
                A current instance of a sample created by the diffusion process.
            generator (`torch.Generator`, *optional*):
                A random number generator (unused; kept for interface compatibility).
            return_dict (`bool`):
                Whether or not to return a [`PCMFMSchedulerOutput`] or tuple.

        Returns:
            [`PCMFMSchedulerOutput`] or `tuple`:
                If return_dict is `True`, [`PCMFMSchedulerOutput`] is
                returned, otherwise a tuple is returned where the first element is the sample tensor.
        """

        if (isinstance(timestep, int) or isinstance(timestep, torch.IntTensor)
                or isinstance(timestep, torch.LongTensor)):
            raise ValueError((
                "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
                " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
                " one of the `scheduler.timesteps` as a timestep."), )

        if self.step_index is None:
            self._init_step_index(timestep)

        # Upcast to float32 to avoid precision issues during integration.
        sample = sample.to(torch.float32)

        sigma = self.sigmas_[self.step_index]

        # Euler update: x_prev = x + (sigma_next - sigma) * v, written via the
        # denoised estimate.
        denoised = sample - model_output * sigma
        derivative = (sample - denoised) / sigma

        dt = self.sigmas_[self.step_index + 1] - sigma
        prev_sample = sample + derivative * dt
        prev_sample = prev_sample.to(model_output.dtype)
        self._step_index += 1

        if not return_dict:
            return (prev_sample, )

        return PCMFMSchedulerOutput(prev_sample=prev_sample)

    def __len__(self):
        return self.config.num_train_timesteps
240
+
241
+
242
class EulerSolver:
    """Euler solver over a subsampled sigma schedule.

    Subsamples `euler_timesteps` indices out of a full `timesteps`-long
    schedule and keeps both the selected sigmas and their predecessors so a
    single Euler step (or a multiphase consistency target) can be taken.
    """

    def __init__(self, sigmas, timesteps=1000, euler_timesteps=50):
        ratio = timesteps // euler_timesteps
        self.step_ratio = ratio
        # Indices of the kept timesteps, and for each one the index it steps to.
        picked = (np.arange(1, euler_timesteps + 1) *
                  ratio).round().astype(np.int64) - 1
        picked_prev = np.asarray([0] + picked[:-1].tolist())
        sel_sigmas = sigmas[picked]
        # Either use sigma0 or 0 as the very first "previous" sigma.
        sel_sigmas_prev = np.asarray([sigmas[0]] +
                                     sigmas[picked[:-1]].tolist())

        self.euler_timesteps = torch.from_numpy(picked).long()
        self.euler_timesteps_prev = torch.from_numpy(picked_prev).long()
        self.sigmas = torch.from_numpy(sel_sigmas)
        self.sigmas_prev = torch.from_numpy(sel_sigmas_prev)

    def to(self, device):
        """Move all cached tensors to `device`; returns self for chaining."""
        for name in ("euler_timesteps", "euler_timesteps_prev", "sigmas",
                     "sigmas_prev"):
            setattr(self, name, getattr(self, name).to(device))
        return self

    def euler_step(self, sample, model_pred, timestep_index):
        """One Euler update from sigma[idx] to sigma_prev[idx]."""
        sigma_now = extract_into_tensor(self.sigmas, timestep_index,
                                        model_pred.shape)
        sigma_next = extract_into_tensor(self.sigmas_prev, timestep_index,
                                         model_pred.shape)
        return sample + (sigma_next - sigma_now) * model_pred

    def euler_style_multiphase_pred(
        self,
        sample,
        model_pred,
        timestep_index,
        multiphase,
        is_target=False,
    ):
        """Euler-jump each sample to the start of its consistency phase.

        The schedule is split into `multiphase` phases; each sample jumps from
        its current sigma (or its predecessor when `is_target`) to the
        previous-sigma of the last phase boundary at or before it.
        Returns the jumped sample and the destination indices.
        """
        phase_starts = np.floor(
            np.linspace(0,
                        len(self.euler_timesteps),
                        num=multiphase,
                        endpoint=False)).astype(np.int64)
        phase_starts = torch.from_numpy(phase_starts).long().to(
            self.euler_timesteps.device)
        # For each sample, find the greatest phase start <= its index.
        expanded = timestep_index.unsqueeze(1).expand(-1,
                                                      phase_starts.size(0))
        valid = expanded >= phase_starts
        last_valid = valid.flip(dims=[1]).long().argmax(dim=1)
        last_valid = phase_starts.size(0) - 1 - last_valid
        timestep_index_end = phase_starts[last_valid]

        lookup = self.sigmas_prev if is_target else self.sigmas
        sigma = extract_into_tensor(lookup, timestep_index, sample.shape)
        sigma_prev = extract_into_tensor(self.sigmas_prev,
                                         timestep_index_end, sample.shape)
        x_prev = sample + (sigma_prev - sigma) * model_pred

        return x_prev, timestep_index_end
fastvideo/models/.DS_Store ADDED
Binary file (6.15 kB). View file
 
fastvideo/models/__pycache__/flash_attn_no_pad.cpython-310.pyc ADDED
Binary file (1.04 kB). View file
 
fastvideo/models/__pycache__/flash_attn_no_pad.cpython-312.pyc ADDED
Binary file (1.41 kB). View file
 
fastvideo/models/flash_attn_no_pad.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from einops import rearrange
2
+ from flash_attn import flash_attn_varlen_qkvpacked_func
3
+ from flash_attn.bert_padding import pad_input, unpad_input
4
+
5
+
6
def flash_attn_no_pad(qkv,
                      key_padding_mask,
                      causal=False,
                      dropout_p=0.0,
                      softmax_scale=None):
    """Run varlen flash attention on a padded, packed-QKV tensor.

    Unpads `qkv` (shape (b, s, 3, h, d)) according to `key_padding_mask`,
    runs `flash_attn_varlen_qkvpacked_func`, then re-pads the output back to
    (b, s, h, d).
    """
    # adapted from https://github.com/Dao-AILab/flash-attention/blob/13403e81157ba37ca525890f2f0f2137edf75311/flash_attn/flash_attention.py#L27
    batch_size, seqlen = qkv.shape[0], qkv.shape[1]
    nheads = qkv.shape[-2]

    packed = rearrange(qkv, "b s three h d -> b s (three h d)")
    packed_unpad, indices, cu_seqlens, max_s, used_seqlens_in_batch = unpad_input(
        packed, key_padding_mask)
    qkv_unpad = rearrange(packed_unpad,
                          "nnz (three h d) -> nnz three h d",
                          three=3,
                          h=nheads)

    out_unpad = flash_attn_varlen_qkvpacked_func(
        qkv_unpad,
        cu_seqlens,
        max_s,
        dropout_p,
        softmax_scale=softmax_scale,
        causal=causal,
    )

    out_padded = pad_input(rearrange(out_unpad, "nnz h d -> nnz (h d)"),
                           indices, batch_size, seqlen)
    return rearrange(out_padded, "b s (h d) -> b s h d", h=nheads)
fastvideo/reward_model/clip_score.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ from torchvision import transforms
4
+ import torch.nn.functional as F
5
+ import clip
6
+ from PIL import Image
7
+ from typing import List, Tuple, Union
8
+ from PIL import Image
9
+ import os
10
+ from open_clip import create_model_from_pretrained, get_tokenizer
11
+ import argparse
12
+
13
+
14
+
15
@torch.no_grad()
def calculate_clip_score(prompts, images, clip_model, device):
    """Cosine similarity between CLIP embeddings of prompts and images.

    `images` must already be preprocessed into the model's input format;
    prompts are tokenized here (truncated to the context length).
    """
    tokens = clip.tokenize(prompts, truncate=True).to(device=device)
    img_emb = clip_model.encode_image(images)
    txt_emb = clip_model.encode_text(tokens)
    return F.cosine_similarity(img_emb, txt_emb)
24
+
25
+
26
class CLIPScoreRewardModel():
    """Scores prompt/image pairs with an open_clip model.

    The score is the dot product of L2-normalized image and text embeddings
    (cosine similarity).
    """

    def __init__(self, clip_model_path, device, http_proxy=None, https_proxy=None, clip_model_type='ViT-H-14'):
        super().__init__()
        # Optionally route model downloads through a proxy.
        for env_key, value in (("http_proxy", http_proxy),
                               ("https_proxy", https_proxy)):
            if value:
                os.environ[env_key] = value
        self.clip_model_path = clip_model_path
        self.clip_model_type = clip_model_type
        self.device = device
        self.load_model()

    def load_model(self, logger=None):
        """Instantiate the CLIP model/preprocessor/tokenizer on `self.device`."""
        self.model, self.preprocess = create_model_from_pretrained(self.clip_model_path)
        self.tokenizer = get_tokenizer(self.clip_model_type)
        self.model.to(self.device)

    # calculate clip score directly, such as for rerank
    @torch.no_grad()
    def __call__(
        self,
        prompts: Union[str, List[str]],
        images: List[Image.Image]
    ) -> List[float]:
        """Return one cosine-similarity score per (prompt, image) pair."""
        if isinstance(prompts, str):
            prompts = [prompts] * len(images)
        if len(prompts) != len(images):
            raise ValueError("prompts must have the same length as images")

        scores = []
        for prompt, image in zip(prompts, images):
            pixels = self.preprocess(image).unsqueeze(0).to(self.device)
            tokens = self.tokenizer(
                [prompt],
                context_length=self.model.context_length
            ).to(self.device)

            img_emb = F.normalize(self.model.encode_image(pixels), dim=-1)
            txt_emb = F.normalize(self.model.encode_text(tokens), dim=-1)

            scores.append((img_emb @ txt_emb.T).item())

        return scores
73
+
74
+
75
+
76
if __name__ == "__main__":
    # CLI demo: score one bundled image against a fixed prompt.
    # NOTE(review): the description string looks copy-pasted from the
    # PickScore script; consider renaming it to "CLIP Score Reward Model".
    parser = argparse.ArgumentParser(description="PickScore Reward Model")
    parser.add_argument("--device", type=str, default="cuda", help="Device to run the model on (e.g., 'cuda', 'cpu')")
    parser.add_argument("--http_proxy", type=str, default=None, help="HTTP proxy URL")
    parser.add_argument("--https_proxy", type=str, default=None, help="HTTPS proxy URL")
    args = parser.parse_args()

    # Example usage
    clip_model_path = 'hf-hub:apple/DFN5B-CLIP-ViT-H-14-384'
    reward_model = CLIPScoreRewardModel(
        clip_model_path,
        device=args.device,
        http_proxy=args.http_proxy,
        https_proxy=args.https_proxy
    )

    image_path = "assets/reward_demo.jpg"
    prompt = "A 3D rendering of anime schoolgirls with a sad expression underwater, surrounded by dramatic lighting."

    image = Image.open(image_path).convert("RGB")
    clip_score = reward_model(prompt, [image])

    print(f"CLIP Score: {clip_score}")
fastvideo/reward_model/hps_score.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union, List
2
+ import argparse
3
+ import torch
4
+ from PIL import Image
5
+
6
+ from HPSv2.hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer
7
+
8
+
9
class HPSClipRewardModel(object):
    """Human Preference Score (HPSv2) reward model.

    Loads an open_clip backbone via HPSv2's `create_model_and_transforms`,
    applies an HPS checkpoint on top, and scores image/text pairs by the
    diagonal of the image-text logit matrix.
    """

    def __init__(self, device, clip_ckpt_path, hps_ckpt_path, model_name='ViT-H-14'):
        # `device` may be an int GPU index, a bare digit string, or a full
        # torch device string such as "cuda:0" — see build_reward_model.
        self.device = device
        self.clip_ckpt_path = clip_ckpt_path
        self.hps_ckpt_path = hps_ckpt_path
        self.model_name = model_name
        self.reward_model, self.text_processor, self.img_processor = self.build_reward_model()

    def build_reward_model(self):
        """Create the CLIP model, load the HPS checkpoint, return (model, tokenizer, image preprocessor)."""
        model, preprocess_train, img_preprocess_val = create_model_and_transforms(
            self.model_name,
            self.clip_ckpt_path,
            precision='amp',
            device=self.device,
            jit=False,
            force_quick_gelu=False,
            force_custom_text=False,
            force_patch_dropout=False,
            force_image_size=None,
            pretrained_image=False,
            image_mean=None,
            image_std=None,
            light_augmentation=True,
            aug_cfg={},
            output_dict=True,
            with_score_predictor=False,
            with_region_predictor=False
        )

        # Convert device name to proper format
        if isinstance(self.device, int):
            ml_device = str(self.device)
        else:
            ml_device = self.device

        # A bare GPU index (int or digit string) becomes "cuda:<idx>";
        # anything already prefixed with "cuda" (or a non-digit string like
        # "cpu") is left untouched.
        if not ml_device.startswith('cuda'):
            ml_device = f'cuda:{ml_device}' if ml_device.isdigit() else ml_device

        checkpoint = torch.load(self.hps_ckpt_path, map_location=ml_device)
        model.load_state_dict(checkpoint['state_dict'])
        text_processor = get_tokenizer(self.model_name)
        reward_model = model.to(self.device)
        reward_model.eval()

        return reward_model, text_processor, img_preprocess_val

    @torch.no_grad()
    def __call__(
        self,
        images: Union[Image.Image, List[Image.Image]],
        texts: Union[str, List[str]],
    ):
        """Score each (image, text) pair; returns a list of float HPS scores."""
        if isinstance(images, Image.Image):
            images = [images]
        if isinstance(texts, str):
            texts = [texts]

        rewards = []
        for image, text in zip(images, texts):
            image = self.img_processor(image).unsqueeze(0).to(self.device, non_blocking=True)
            text = self.text_processor([text]).to(device=self.device, non_blocking=True)
            # Mixed-precision forward; assumes a CUDA device — TODO confirm CPU path.
            with torch.amp.autocast('cuda'):
                outputs = self.reward_model(image, text)
                image_features, text_features = outputs["image_features"], outputs["text_features"]
                logits_per_image = image_features @ text_features.T
                hps_score = torch.diagonal(logits_per_image)

            # reward is a tensor of shape (1,) --> list
            rewards.append(hps_score.float().cpu().item())

        return rewards
fastvideo/reward_model/image_reward.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Image-Reward: Copyied from https://github.com/THUDM/ImageReward
2
+ import os
3
+ from typing import Union, List
4
+ from PIL import Image
5
+
6
+ import torch
7
try:
    import ImageReward as RM
except ImportError as err:
    # A bare `except:` previously swallowed every exception (including
    # KeyboardInterrupt), and `raise Warning(...)` raised a Warning *as an
    # exception* instead of signalling a missing dependency. Re-raise as a
    # proper ImportError with the install hint, chained to the original.
    raise ImportError(
        "ImageReward is required to be installed (`pip install image-reward`) "
        "when using ImageReward for post-training.") from err
11
+
12
+
13
class ImageRewardModel(object):
    """Thin wrapper around THUDM's ImageReward for scoring image/text pairs."""

    def __init__(self, model_name, device, http_proxy=None, https_proxy=None, med_config=None):
        # Optionally route model downloads through a proxy.
        for env_key, value in (("http_proxy", http_proxy),
                               ("https_proxy", https_proxy)):
            if value:
                os.environ[env_key] = value
        self.model_name = model_name or "ImageReward-v1.0"
        self.device = device
        self.med_config = med_config
        self.build_reward_model()

    def build_reward_model(self):
        """Load the ImageReward checkpoint onto `self.device`."""
        self.model = RM.load(self.model_name,
                             device=self.device,
                             med_config=self.med_config)

    @torch.no_grad()
    def __call__(
        self,
        images,
        texts,
    ):
        """Return one reward per (image, text) pair; a single text string is broadcast."""
        if isinstance(texts, str):
            texts = [texts] * len(images)

        rewards = []
        for image, text in zip(images, texts):
            _ranking, reward = self.model.inference_rank(text, [image])
            rewards.append(reward)
        return rewards
fastvideo/reward_model/pick_score.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import argparse
4
+ from typing import List, Tuple, Union
5
+ from transformers import AutoProcessor, AutoModel
6
+ from PIL import Image
7
+
8
+
9
class PickScoreRewardModel(object):
    """PickScore (CLIP-based human-preference) reward model.

    Embeds each prompt and image with the PickScore CLIP checkpoint and
    returns the logit-scaled similarity, standardized by `mean`/`std`.
    """

    def __init__(self, device: str = "cuda", http_proxy=None, https_proxy=None, mean=18.0, std=8.0):
        """
        Initialize PickScore reward model.

        Args:
            device: Device to run the model on ('cuda' or 'cpu')
            http_proxy / https_proxy: Optional proxy URLs for model download.
            mean / std: Standardization constants applied to raw scores.
        """
        if http_proxy:
            os.environ["http_proxy"] = http_proxy
        if https_proxy:
            os.environ["https_proxy"] = https_proxy
        self.device = device
        self.processor_name_or_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
        self.model_pretrained_name_or_path = "yuvalkirstain/PickScore_v1"
        self.mean = mean
        self.std = std

        # Initialize model and processor
        self.processor = AutoProcessor.from_pretrained(self.processor_name_or_path)
        self.model = AutoModel.from_pretrained(self.model_pretrained_name_or_path).eval().to(device)

    @torch.no_grad()
    def __call__(
        self,
        images: List[Image.Image],
        prompts: Union[str, List[str]],
    ) -> List[float]:
        # NOTE: the previous annotation/docstring claimed a
        # (probabilities, scores) tuple, but the method has only ever
        # returned a single list of scores.
        """
        Calculate standardized PickScore values for image/prompt pairs.

        Args:
            images: List of PIL Images to evaluate
            prompts: Text prompt(s) to evaluate images against; a single
                string is broadcast to all images.

        Returns:
            List of standardized scores ((raw - mean) / std), one per image.

        Raises:
            ValueError: If `prompts` and `images` have different lengths.
        """
        if isinstance(prompts, str):
            prompts = [prompts] * len(images)
        if len(prompts) != len(images):
            raise ValueError("prompts must have the same length as images")

        scores = []
        for prompt, image in zip(prompts, images):
            # Preprocess images
            image_inputs = self.processor(
                images=[image],
                padding=True,
                truncation=True,
                max_length=77,
                return_tensors="pt",
            ).to(self.device)

            # Preprocess text
            text_inputs = self.processor(
                text=prompt,
                padding=True,
                truncation=True,
                max_length=77,
                return_tensors="pt",
            ).to(self.device)

            # Get L2-normalized embeddings
            image_embs = self.model.get_image_features(**image_inputs)
            image_embs = image_embs / torch.norm(image_embs, dim=-1, keepdim=True)

            text_embs = self.model.get_text_features(**text_inputs)
            text_embs = text_embs / torch.norm(text_embs, dim=-1, keepdim=True)

            # Logit-scaled cosine similarity, then standardized.
            score = self.model.logit_scale.exp() * (text_embs @ image_embs.T)[0]
            score = (score - self.mean) / self.std
            scores.extend(score.cpu().tolist())

        return scores
85
+
86
+
87
if __name__ == "__main__":
    # CLI demo: score one bundled image against a fixed prompt.
    parser = argparse.ArgumentParser(description="PickScore Reward Model")
    parser.add_argument("--device", type=str, default="cuda", help="Device to run the model on (e.g., 'cuda', 'cpu')")
    parser.add_argument("--http_proxy", type=str, default=None, help="HTTP proxy URL")
    parser.add_argument("--https_proxy", type=str, default=None, help="HTTPS proxy URL")
    args = parser.parse_args()

    # Example usage
    reward_model = PickScoreRewardModel(
        device=args.device,
        http_proxy=args.http_proxy,
        https_proxy=args.https_proxy,
    )
    pil_images = [Image.open("assets/reward_demo.jpg")]

    prompt = "A 3D rendering of anime schoolgirls with a sad expression underwater, surrounded by dramatic lighting."

    scores = reward_model(pil_images, [prompt] * len(pil_images))
    # Undo the model's (x - mean) / std standardization and rescale to ~[0, 1]
    # for display.
    scores = [(s * reward_model.std + reward_model.mean) / 100.0 for s in scores]
    print(f"scores: {scores}")
+ print(f"scores: {scores}")
107
+
fastvideo/reward_model/unified_reward.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import base64
3
+ import os
4
+ import re
5
+ import requests
6
+ import time
7
+ import concurrent.futures
8
+ from io import BytesIO
9
+ from typing import List, Optional, Union
10
+
11
+ from PIL import Image
12
+
13
+
14
# Prompt template for the "semantic" evaluation mode: asks the VLM for
# word-level mismatch scores plus 1-5 Alignment / Coherence / Style ratings.
# The single `{}` placeholder is filled with the image's text caption.
QUESTION_TEMPLATE_SEMANTIC = (
    "You are presented with a generated image and its associated text caption. Your task is to analyze the image across multiple dimensions in relation to the caption. Specifically:\n\n"
    "1. Evaluate each word in the caption based on how well it is visually represented in the image. Assign a numerical score to each word using the format:\n"
    " Word-wise Scores: [[\"word1\", score1], [\"word2\", score2], ..., [\"wordN\", scoreN], [\"[No_mistakes]\", scoreM]]\n"
    " - A higher score indicates that the word is less well represented in the image.\n"
    " - The special token [No_mistakes] represents whether all elements in the caption were correctly depicted. A high score suggests no mistakes; a low score suggests missing or incorrect elements.\n\n"
    "2. Provide overall assessments for the image along the following axes (each rated from 1 to 5):\n"
    "- Alignment Score: How well the image matches the caption in terms of content.\n"
    "- Coherence Score: How logically consistent the image is (absence of visual glitches, object distortions, etc.).\n"
    "- Style Score: How aesthetically appealing the image looks, regardless of caption accuracy.\n\n"
    "Output your evaluation using the format below:\n\n"
    "---\n\n"
    "Word-wise Scores: [[\"word1\", score1], ..., [\"[No_mistakes]\", scoreM]]\n\n"
    "Alignment Score (1-5): X\n"
    "Coherence Score (1-5): Y\n"
    "Style Score (1-5): Z\n\n"
    "Your task is provided as follows:\nText Caption: [{}]"
)

# Prompt template for the "score" evaluation mode: asks the VLM to check
# caption elements one by one and emit a single 'Final Score:' in [1, 5].
# The single `{}` placeholder is filled with the image's text caption.
QUESTION_TEMPLATE_SCORE = (
    "You are given a text caption and a generated image based on that caption. Your task is to evaluate this image based on two key criteria:\n"
    "1. Alignment with the Caption: Assess how well this image aligns with the provided caption. Consider the accuracy of depicted objects, their relationships, and attributes as described in the caption.\n"
    "2. Overall Image Quality: Examine the visual quality of this image, including clarity, detail preservation, color accuracy, and overall aesthetic appeal.\n"
    "Extract key elements from the provided text caption, evaluate their presence in the generated image using the format: \'element (type): value\' (where value=0 means not generated, and value=1 means generated), and assign a score from 1 to 5 after \'Final Score:\'.\n"
    "Your task is provided as follows:\nText Caption: [{}]"
)
40
+
41
+
42
class VLMessageClient:
    """Thin HTTP client for an OpenAI-compatible vision-language reward server.

    Builds chat messages containing one image (file path or PIL image) plus a
    text question, POSTs them to ``{api_url}/v1/chat/completions`` and returns
    the model's raw text output. Usable as a context manager so the underlying
    ``requests.Session`` is closed deterministically.
    """

    def __init__(self, api_url):
        self.api_url = api_url
        self._session = None  # created lazily on first request

    @property
    def session(self):
        # Lazy creation: constructing a client is cheap and does no network
        # work until the first request is actually sent.
        if self._session is None:
            self._session = requests.Session()
        return self._session

    def close(self):
        """Close the session if it exists."""
        if self._session is not None:
            self._session.close()
            self._session = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _encode_image_base64(self, image):
        """Encode *image* (file path or PIL.Image) as a base64 JPEG string."""
        if isinstance(image, str):
            with Image.open(image) as img:
                img = img.convert("RGB")
                buffered = BytesIO()
                img.save(buffered, format="JPEG", quality=95)
                return base64.b64encode(buffered.getvalue()).decode("utf-8")
        elif isinstance(image, Image.Image):
            buffered = BytesIO()
            image.save(buffered, format="JPEG", quality=95)
            return base64.b64encode(buffered.getvalue()).decode("utf-8")
        else:
            raise ValueError(f"Unsupported image type: {type(image)}")

    def build_messages(self, item, image_root=""):
        """Build the raw message list for one ``{'image': ..., 'question': ...}`` item.

        ``item['image']`` may be a path (joined with *image_root*) encoded as a
        ``file://`` URL, or a PIL image tagged with the internal ``pil_image``
        type that ``format_messages`` later converts to base64.
        """
        if isinstance(item['image'], str):
            image_path = os.path.join(image_root, item['image'])
            return [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": f"file://{image_path}"}},
                        {
                            "type": "text",
                            "text": f"{item['question']}"
                        }
                    ]
                }
            ]
        assert isinstance(item['image'], Image.Image), f"image must be a PIL.Image.Image, but got {type(item['image'])}"
        return [
            {
                "role": "user",
                "content": [
                    {"type": "pil_image", "pil_image": item['image']},
                    {
                        "type": "text",
                        "text": f"{item['question']}"
                    }
                ]
            }
        ]

    def format_messages(self, messages):
        """Convert raw messages into wire format.

        System messages are flattened to a plain string; ``image_url`` and
        ``pil_image`` parts are both re-encoded as base64 data URLs; any other
        part (e.g. text) passes through unchanged.
        """
        formatted = []
        for msg in messages:
            new_msg = {"role": msg["role"], "content": []}

            if msg["role"] == "system":
                new_msg["content"] = msg["content"][0]["text"]
            else:
                for part in msg["content"]:
                    if part["type"] == "image_url":
                        img_path = part["image_url"]["url"].replace("file://", "")
                        base64_image = self._encode_image_base64(img_path)
                        new_part = {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                        }
                        new_msg["content"].append(new_part)
                    elif part["type"] == "pil_image":
                        base64_image = self._encode_image_base64(part["pil_image"])
                        new_part = {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                        }
                        new_msg["content"].append(new_part)
                    else:
                        new_msg["content"].append(part)
            formatted.append(new_msg)
        return formatted

    def process_item(self, item, image_root=""):
        """Send one item to the server with up to 3 retries.

        Returns ``(result_dict, success_flag)`` on success. If the final
        attempt also fails, the exception is re-raised (callers such as
        ``UnifiedRewardModel._process_item_wrapper`` catch it).
        """
        max_retries = 3
        attempt = 0
        result = None

        while attempt < max_retries:
            try:
                attempt += 1
                raw_messages = self.build_messages(item, image_root)
                formatted_messages = self.format_messages(raw_messages)

                payload = {
                    "model": "UnifiedReward",
                    "messages": formatted_messages,
                    "temperature": 0,
                    "max_tokens": 4096,
                }

                # Later attempts get a slightly longer timeout.
                response = self.session.post(
                    f"{self.api_url}/v1/chat/completions",
                    json=payload,
                    timeout=30 + attempt*5
                )
                response.raise_for_status()

                output = response.json()["choices"][0]["message"]["content"]

                result = {
                    "question": item["question"],
                    "image_path": item["image"] if isinstance(item["image"], str) else "PIL_Image",
                    "model_output": output,
                    "attempt": attempt,
                    "success": True
                }
                break

            except Exception as e:
                if attempt == max_retries:
                    result = {
                        "question": item["question"],
                        "image_path": item["image"] if isinstance(item["image"], str) else "PIL_Image",
                        "error": str(e),
                        "attempt": attempt,
                        "success": False
                    }
                    # Bare `raise` re-raises the active exception with its
                    # original traceback (was `raise(e)`, which restarts it).
                    raise
                else:
                    # Exponential backoff, capped at 10 seconds.
                    sleep_time = min(2 ** attempt, 10)
                    time.sleep(sleep_time)

        return result, result.get("success", False)
188
+
189
+
190
class UnifiedRewardModel(object):
    """Scores generated images against prompts via a remote UnifiedReward
    VLM server, fanning individual requests out over a thread pool.
    """

    def __init__(self, api_url, default_question_type="score", num_workers=8):
        self.api_url = api_url
        self.num_workers = num_workers
        self.default_question_type = default_question_type
        self.question_template_score = QUESTION_TEMPLATE_SCORE
        self.question_template_semantic = QUESTION_TEMPLATE_SEMANTIC
        # self.client = VLMessageClient(self.api_url)

    def question_constructor(self, prompt, question_type=None):
        """Render the evaluation question for *prompt* using the template
        selected by *question_type* ('score' or 'semantic')."""
        if question_type is None:
            question_type = self.default_question_type
        templates = {
            "score": self.question_template_score,
            "semantic": self.question_template_semantic,
        }
        if question_type not in templates:
            raise ValueError(f"Invalid question type: {question_type}")
        return templates[question_type].format(prompt)

    def _process_item_wrapper(self, client, image, question):
        """Run one request through *client*; return its result dict, or None on error."""
        try:
            outcome, _ = client.process_item({"image": image, "question": question})
            return outcome
        except Exception as e:
            print(f"Encountered error in unified reward model processing: {str(e)}")
            return None

    def _reset_proxy(self):
        # Drop any proxy settings so requests go straight to the server URL.
        for key in ('http_proxy', 'https_proxy'):
            os.environ.pop(key, None)

    def __call__(self,
                 images: Union[List[Image.Image], List[str]],
                 prompts: Union[str, List[str]],
                 question_type: Optional[str] = None,
                 ):
        """Score each image against its prompt.

        Returns ``(results, successes)``: a parsed float score (or None) and a
        success flag per input, aligned with the input order.
        """
        # Reset proxy, otherwise cannot access the server url
        self._reset_proxy()
        if isinstance(prompts, str):
            prompts = [prompts] * len(images)
        if len(prompts) != len(images):
            raise ValueError("prompts must have the same length as images")

        # Pre-fill with failure defaults; successful completions overwrite.
        parsed_scores = [None] * len(images)
        ok_flags = [False] * len(images)

        with VLMessageClient(self.api_url) as client:
            questions = [self.question_constructor(p, question_type) for p in prompts]

            with concurrent.futures.ThreadPoolExecutor(max_workers=self.num_workers) as pool:
                # Remember each future's original position so out-of-order
                # completion still fills the right slot.
                pending = {
                    pool.submit(self._process_item_wrapper, client, img, q): pos
                    for pos, (img, q) in enumerate(zip(images, questions))
                }

                for done in concurrent.futures.as_completed(pending):
                    pos = pending[done]
                    outcome = done.result()
                    if outcome is not None and outcome.get("success", False):
                        raw_text = outcome.get("model_output", "")
                        parsed_scores[pos] = self.score_parser(raw_text, question_type)
                        ok_flags[pos] = True

        return parsed_scores, ok_flags

    def score_parser(self, text, question_type=None):
        """Parse the server's free-text output into a float score (or None)."""
        if question_type is None:
            question_type = self.default_question_type
        parsers = {
            "score": self.extract_final_score,
            "semantic": self.extract_alignment_score,
        }
        if question_type not in parsers:
            raise ValueError(f"Invalid question type: {question_type}")
        return parsers[question_type](text)

    @staticmethod
    def extract_alignment_score(text):
        """
        Extract Alignment Score (1-5) from the evaluation text.
        Returns a float score if found, None otherwise.
        """
        found = re.search(r'Alignment Score \(1-5\):\s*([0-5](?:\.\d+)?)', text)
        return float(found.group(1)) if found else None

    @staticmethod
    def extract_final_score(text):
        """
        Extract Final Score from the evaluation text.
        Returns a float score if found, None otherwise.
        Example input:
        'ocean (location): 0
        clouds (object): 1
        birds (animal): 0
        day time (attribute): 1
        low depth field effect (attribute): 1
        painting (attribute): 1
        Final Score: 2.33'
        """
        found = re.search(r'Final Score:\s*([0-5](?:\.\d+)?)', text)
        return float(found.group(1)) if found else None
307
+
308
+
309
if __name__ == "__main__":
    # Smoke test: score copies of the demo image against one prompt via a
    # running UnifiedReward server.
    parser = argparse.ArgumentParser()
    parser.add_argument("--api_url", type=str)
    parser.add_argument("--max_workers", type=int)
    args = parser.parse_args()

    unified_reward_model = UnifiedRewardModel(args.api_url, num_workers=args.max_workers)
    img_path = "assets/reward_demo.jpg"
    # 16 copies of the same image to exercise the thread pool.
    images = [
        Image.open(img_path).convert("RGB")
        for i in range(1, 5)
    ] * 4
    prompts = "A 3D rendering of anime schoolgirls with a sad expression underwater, surrounded by dramatic lighting."
    results, successes = unified_reward_model(images, prompts, question_type="semantic")
    print(results)
    print(successes)

    # # Concurrency test
    # proc_num = 32

    # for i in range(5):
    #     with concurrent.futures.ThreadPoolExecutor(max_workers=proc_num) as executor:
    #         futures = [executor.submit(unified_reward_model, images, prompts, question_type="semantic") for _ in range(proc_num)]
    #         results = [future.result() for future in concurrent.futures.as_completed(futures)]
    #         print(results)
fastvideo/reward_model/utils.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import concurrent.futures
2
+ import random
3
+
4
+ def _compute_single_reward(reward_model, images, input_prompts):
5
+ """Compute reward for a single reward model."""
6
+ reward_model_name = type(reward_model).__name__
7
+ try:
8
+ if reward_model_name == 'HPSClipRewardModel':
9
+ rewards = reward_model(images, input_prompts)
10
+ successes = [1] * len(rewards)
11
+
12
+ elif reward_model_name == 'CLIPScoreRewardModel':
13
+ rewards = reward_model(input_prompts, images)
14
+ successes = [1] * len(rewards)
15
+
16
+ elif reward_model_name == 'ImageRewardModel':
17
+ rewards = reward_model(images, input_prompts)
18
+ successes = [1] * len(rewards)
19
+
20
+ elif reward_model_name == 'UnifiedRewardModel':
21
+ rewards, successes_bool = reward_model(images, input_prompts)
22
+ rewards = [float(reward) if success else 0.0 for reward, success in zip(rewards, successes_bool)]
23
+ successes = [1 if success else 0 for success in successes_bool]
24
+
25
+ elif reward_model_name == 'PickScoreRewardModel':
26
+ rewards = reward_model(images, input_prompts)
27
+ successes = [1] * len(rewards)
28
+
29
+ else:
30
+ raise ValueError(f"Unknown reward model: {reward_model_name}")
31
+
32
+ # Verify the length of results matches input
33
+ assert len(rewards) == len(input_prompts), \
34
+ f"Length mismatch in {reward_model_name}: rewards ({len(rewards)}) != input_prompts ({len(input_prompts)})"
35
+ assert len(successes) == len(input_prompts), \
36
+ f"Length mismatch in {reward_model_name}: successes ({len(successes)}) != input_prompts ({len(input_prompts)})"
37
+
38
+ return rewards, successes
39
+
40
+ except Exception as e:
41
+ raise ValueError(f"Error in _compute_single_reward with {reward_model_name}: {e}") from e
42
+
43
+ def compute_reward(images, input_prompts, reward_models, reward_weights):
44
+ assert (
45
+ len(images) == len(input_prompts)
46
+ ), f"length of `images` ({len(images)}) must be equal to length of `input_prompts` ({len(input_prompts)})"
47
+
48
+ # Initialize results
49
+ rewards_dict = {}
50
+ successes_dict = {}
51
+
52
+ # Create a thread pool for parallel reward computation
53
+ with concurrent.futures.ThreadPoolExecutor(max_workers=len(reward_models)) as executor:
54
+ # Submit all reward computation tasks
55
+ future_to_model = {
56
+ executor.submit(_compute_single_reward, reward_model, images, input_prompts): reward_model
57
+ for reward_model in reward_models
58
+ }
59
+
60
+ # Process results as they complete
61
+ for future in concurrent.futures.as_completed(future_to_model):
62
+ reward_model = future_to_model[future]
63
+ model_name = type(reward_model).__name__
64
+ try:
65
+ model_rewards, model_successes = future.result()
66
+ rewards_dict[model_name] = model_rewards
67
+ successes_dict[model_name] = model_successes
68
+ except Exception as e:
69
+ print(f"Error computing reward with {model_name}: {e}")
70
+ rewards_dict[model_name] = [0.0] * len(input_prompts)
71
+ successes_dict[model_name] = [0] * len(input_prompts)
72
+ continue
73
+
74
+ # Merge rewards based on weights
75
+ merged_rewards = [0.0] * len(input_prompts)
76
+ merged_successes = [0] * len(input_prompts)
77
+
78
+ # First check if all models are successful for each sample
79
+ for i in range(len(merged_rewards)):
80
+ all_success = True
81
+ for model_name in reward_weights.keys():
82
+ if model_name in successes_dict and successes_dict[model_name][i] != 1:
83
+ all_success = False
84
+ break
85
+
86
+ if all_success:
87
+ # Only compute weighted sum if all models are successful
88
+ for model_name, weight in reward_weights.items():
89
+ if model_name in rewards_dict:
90
+ merged_rewards[i] += rewards_dict[model_name][i] * weight
91
+ merged_successes[i] = 1
92
+
93
+ return merged_rewards, merged_successes, rewards_dict, successes_dict
94
+
95
def balance_pos_neg(samples, use_random=False):
    """Reorder *samples* so positive- and negative-advantage samples alternate.

    With ``use_random=True`` a plain random permutation is returned instead.
    NOTE(review): in the balanced path, samples whose advantage is exactly 0
    are dropped — confirm this is intended upstream.
    """
    if use_random:
        return random.sample(samples, len(samples))

    pos = [s for s in samples if s['advantages'].item() > 0]
    neg = [s for s in samples if s['advantages'].item() < 0]

    # Shuffle each group independently before interleaving.
    pos = random.sample(pos, len(pos))
    neg = random.sample(neg, len(neg))

    # On a tie the negative group is treated as the smaller one, matching the
    # strict `<` comparison of the original ordering.
    small, large = (pos, neg) if len(pos) < len(neg) else (neg, pos)

    interleaved = []
    for a, b in zip(small, large):
        interleaved.append(a)
        interleaved.append(b)

    # Whatever remains in the larger group goes at the end.
    interleaved.extend(large[len(small):])
    return interleaved
126
+
fastvideo/utils/.DS_Store ADDED
Binary file (6.15 kB). View file
 
fastvideo/utils/checkpoint.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #This code file is from [https://github.com/hao-ai-lab/FastVideo], which is licensed under Apache License 2.0.
2
+
3
+ import json
4
+ import os
5
+
6
+ import torch
7
+ import torch.distributed.checkpoint as dist_cp
8
+ from peft import get_peft_model_state_dict
9
+ from safetensors.torch import load_file, save_file
10
+ from torch.distributed.checkpoint.default_planner import (DefaultLoadPlanner,
11
+ DefaultSavePlanner)
12
+ from torch.distributed.checkpoint.optimizer import \
13
+ load_sharded_optimizer_state_dict
14
+ from torch.distributed.fsdp import (FullOptimStateDictConfig,
15
+ FullStateDictConfig)
16
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
17
+ from torch.distributed.fsdp import StateDictType
18
+
19
+ from fastvideo.utils.logging_ import main_print
20
+
21
+
22
def save_checkpoint_optimizer(model,
                              optimizer,
                              rank,
                              output_dir,
                              step,
                              discriminator=False):
    """Save full (unsharded) model weights + optimizer state under
    ``output_dir/checkpoint-{step}``.

    Gathers FSDP state onto CPU (rank 0 only) and writes either the
    diffusion-model files or, when ``discriminator=True``, the discriminator
    files. Must be called on all ranks — the state-dict gather is collective.
    NOTE(review): on ranks > 0 with ``discriminator=False`` control falls into
    the else branch and writes discriminator files with an (empty) state dict;
    confirm whether a rank guard is missing here.
    """
    with FSDP.state_dict_type(
            model,
            StateDictType.FULL_STATE_DICT,
            FullStateDictConfig(offload_to_cpu=True, rank0_only=True),
            FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=True),
    ):
        cpu_state = model.state_dict()
        optim_state = FSDP.optim_state_dict(
            model,
            optimizer,
        )

    # todo move to get_state_dict
    save_dir = os.path.join(output_dir, f"checkpoint-{step}")
    os.makedirs(save_dir, exist_ok=True)
    # save using safetensors
    if rank <= 0 and not discriminator:
        weight_path = os.path.join(save_dir,
                                   "diffusion_pytorch_model.safetensors")
        save_file(cpu_state, weight_path)
        config_dict = dict(model.config)
        # 'dtype' is not JSON-serializable; drop it before dumping.
        config_dict.pop('dtype')
        config_path = os.path.join(save_dir, "config.json")
        # save dict as json
        with open(config_path, "w") as f:
            json.dump(config_dict, f, indent=4)
        optimizer_path = os.path.join(save_dir, "optimizer.pt")
        torch.save(optim_state, optimizer_path)
    else:
        weight_path = os.path.join(save_dir,
                                   "discriminator_pytorch_model.safetensors")
        save_file(cpu_state, weight_path)
        optimizer_path = os.path.join(save_dir, "discriminator_optimizer.pt")
        torch.save(optim_state, optimizer_path)
    main_print(f"--> checkpoint saved at step {step}")
63
+
64
+
65
def save_checkpoint(transformer, rank, output_dir, step, epoch):
    """Save a full (unsharded) transformer checkpoint (weights + config only).

    Gathers the FSDP-sharded weights onto CPU and, on rank 0, writes
    ``output_dir/checkpoint-{step}-{epoch}/diffusion_pytorch_model.safetensors``
    plus a ``config.json``. All ranks must enter (the gather is collective);
    only rank 0 touches the filesystem.
    """
    main_print(f"--> saving checkpoint at step {step}")
    with FSDP.state_dict_type(
            transformer,
            StateDictType.FULL_STATE_DICT,
            FullStateDictConfig(offload_to_cpu=True, rank0_only=True),
    ):
        cpu_state = transformer.state_dict()
    # todo move to get_state_dict
    if rank <= 0:
        save_dir = os.path.join(output_dir, f"checkpoint-{step}-{epoch}")
        os.makedirs(save_dir, exist_ok=True)
        # save using safetensors
        weight_path = os.path.join(save_dir,
                                   "diffusion_pytorch_model.safetensors")
        save_file(cpu_state, weight_path)
        config_dict = dict(transformer.config)
        # 'dtype' is not JSON-serializable; drop it before dumping.
        if "dtype" in config_dict:
            del config_dict["dtype"]  # TODO
        config_path = os.path.join(save_dir, "config.json")
        # save dict as json
        with open(config_path, "w") as f:
            json.dump(config_dict, f, indent=4)
    main_print(f"--> checkpoint saved at step {step}")
89
+
90
+
91
def save_checkpoint_generator_discriminator(
    model,
    optimizer,
    discriminator,
    discriminator_optimizer,
    rank,
    output_dir,
    step,
):
    """Save a resumable GAN-style checkpoint under ``output_dir/checkpoint-{step}``.

    Writes three artifacts:
      * ``hf_weights/``: full generator weights (safetensors) + config, rank 0 only;
      * ``model_weights_state/`` and ``model_optimizer_state/``: sharded
        generator weights/optimizer via torch.distributed.checkpoint
        (every rank writes its own shard);
      * ``discriminator_fsdp_state/discriminator_state.pt``: full
        discriminator model + optimizer state, rank 0 only.
    All ranks must call this — the state-dict gathers and distributed-checkpoint
    writes are collective operations.
    """
    with FSDP.state_dict_type(
            model,
            StateDictType.FULL_STATE_DICT,
            FullStateDictConfig(offload_to_cpu=True, rank0_only=True),
    ):
        cpu_state = model.state_dict()

    # todo move to get_state_dict
    save_dir = os.path.join(output_dir, f"checkpoint-{step}")
    os.makedirs(save_dir, exist_ok=True)
    hf_weight_dir = os.path.join(save_dir, "hf_weights")
    os.makedirs(hf_weight_dir, exist_ok=True)
    # save using safetensors
    if rank <= 0:
        config_dict = dict(model.config)
        config_path = os.path.join(hf_weight_dir, "config.json")
        # save dict as json
        with open(config_path, "w") as f:
            json.dump(config_dict, f, indent=4)
        weight_path = os.path.join(hf_weight_dir,
                                   "diffusion_pytorch_model.safetensors")
        save_file(cpu_state, weight_path)

    main_print(f"--> saved HF weight checkpoint at path {hf_weight_dir}")
    model_weight_dir = os.path.join(save_dir, "model_weights_state")
    os.makedirs(model_weight_dir, exist_ok=True)
    model_optimizer_dir = os.path.join(save_dir, "model_optimizer_state")
    os.makedirs(model_optimizer_dir, exist_ok=True)
    # Sharded save: each rank writes only its own shard of the generator.
    with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
        optim_state = FSDP.optim_state_dict(model, optimizer)
        model_state = model.state_dict()
        weight_state_dict = {"model": model_state}
        dist_cp.save_state_dict(
            state_dict=weight_state_dict,
            storage_writer=dist_cp.FileSystemWriter(model_weight_dir),
            planner=DefaultSavePlanner(),
        )
        optimizer_state_dict = {"optimizer": optim_state}
        dist_cp.save_state_dict(
            state_dict=optimizer_state_dict,
            storage_writer=dist_cp.FileSystemWriter(model_optimizer_dir),
            planner=DefaultSavePlanner(),
        )

    discriminator_fsdp_state_dir = os.path.join(save_dir,
                                                "discriminator_fsdp_state")
    os.makedirs(discriminator_fsdp_state_dir, exist_ok=True)
    # Discriminator is small enough to save as one full state dict on rank 0.
    with FSDP.state_dict_type(
            discriminator,
            StateDictType.FULL_STATE_DICT,
            FullStateDictConfig(offload_to_cpu=True, rank0_only=True),
            FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=True),
    ):
        optim_state = FSDP.optim_state_dict(discriminator,
                                            discriminator_optimizer)
        model_state = discriminator.state_dict()
        state_dict = {"optimizer": optim_state, "model": model_state}
        if rank <= 0:
            discriminator_fsdp_state_fil = os.path.join(
                discriminator_fsdp_state_dir, "discriminator_state.pt")
            torch.save(state_dict, discriminator_fsdp_state_fil)

    main_print("--> saved FSDP state checkpoint")
163
+
164
+
165
def load_sharded_model(model, optimizer, model_dir, optimizer_dir):
    """Load sharded FSDP model + optimizer state written by
    ``save_checkpoint_generator_discriminator``.

    Every rank reads its own shard from *model_dir* / *optimizer_dir* via
    torch.distributed.checkpoint. Returns the updated ``(model, optimizer)``.
    """
    with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
        # The current (sharded) state dict acts as the template describing
        # which tensors/shards this rank should read from storage.
        weight_state_dict = {"model": model.state_dict()}

        optim_state = load_sharded_optimizer_state_dict(
            model_state_dict=weight_state_dict["model"],
            optimizer_key="optimizer",
            storage_reader=dist_cp.FileSystemReader(optimizer_dir),
        )
        optim_state = optim_state["optimizer"]
        # Re-flatten the optimizer state into the layout FSDP expects.
        flattened_osd = FSDP.optim_state_dict_to_load(
            model=model, optim=optimizer, optim_state_dict=optim_state)
        optimizer.load_state_dict(flattened_osd)
        # Load the model weight shards in place into weight_state_dict.
        dist_cp.load_state_dict(
            state_dict=weight_state_dict,
            storage_reader=dist_cp.FileSystemReader(model_dir),
            planner=DefaultLoadPlanner(),
        )
        model_state = weight_state_dict["model"]
        model.load_state_dict(model_state)
    main_print(f"--> loaded model and optimizer from path {model_dir}")
    return model, optimizer
187
+
188
+
189
def load_full_state_model(model, optimizer, checkpoint_file, rank):
    """Load a full (unsharded) model + optimizer state from *checkpoint_file*.

    The file holds ``{"model": ..., "optimizer": ...}`` as written by
    ``save_checkpoint_generator_discriminator`` for the discriminator.
    NOTE: ``torch.load`` uses default (pickle) semantics — load trusted
    checkpoints only.
    """
    with FSDP.state_dict_type(
            model,
            StateDictType.FULL_STATE_DICT,
            FullStateDictConfig(offload_to_cpu=True, rank0_only=True),
            FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=True),
    ):
        discriminator_state = torch.load(checkpoint_file)
        model_state = discriminator_state["model"]
        if rank <= 0:
            optim_state = discriminator_state["optimizer"]
        else:
            # NOTE(review): non-zero ranks pass None here — assumes FSDP
            # redistributes rank-0 optimizer state under rank0_only; confirm.
            optim_state = None
        model.load_state_dict(model_state)
        discriminator_optim_state = FSDP.optim_state_dict_to_load(
            model=model, optim=optimizer, optim_state_dict=optim_state)
        optimizer.load_state_dict(discriminator_optim_state)
    main_print(
        f"--> loaded discriminator and discriminator optimizer from path {checkpoint_file}"
    )
    return model, optimizer
210
+
211
+
212
def resume_training_generator_discriminator(model, optimizer, discriminator,
                                            discriminator_optimizer,
                                            checkpoint_dir, rank):
    """Resume generator + discriminator training from *checkpoint_dir*.

    Loads the sharded generator state and the full discriminator state that
    ``save_checkpoint_generator_discriminator`` wrote. The global step is
    parsed from the directory name (``.../checkpoint-{step}``).
    Returns ``(model, optimizer, discriminator, discriminator_optimizer, step)``.
    """
    step = int(checkpoint_dir.split("-")[-1])
    model_weight_dir = os.path.join(checkpoint_dir, "model_weights_state")
    model_optimizer_dir = os.path.join(checkpoint_dir, "model_optimizer_state")
    model, optimizer = load_sharded_model(model, optimizer, model_weight_dir,
                                          model_optimizer_dir)
    discriminator_ckpt_file = os.path.join(checkpoint_dir,
                                           "discriminator_fsdp_state",
                                           "discriminator_state.pt")
    discriminator, discriminator_optimizer = load_full_state_model(
        discriminator, discriminator_optimizer, discriminator_ckpt_file, rank)
    return model, optimizer, discriminator, discriminator_optimizer, step
226
+
227
+
228
def resume_training(model, optimizer, checkpoint_dir, discriminator=False):
    """Resume from a checkpoint written by ``save_checkpoint_optimizer``.

    Loads the safetensors weights (generator or discriminator files depending
    on *discriminator*) and the matching optimizer state. The global step is
    parsed from the directory name (``.../checkpoint-{step}``).
    Returns ``(model, optimizer, step)``.
    """
    weight_path = os.path.join(checkpoint_dir,
                               "diffusion_pytorch_model.safetensors")
    if discriminator:
        weight_path = os.path.join(checkpoint_dir,
                                   "discriminator_pytorch_model.safetensors")
    model_weights = load_file(weight_path)

    with FSDP.state_dict_type(
            model,
            StateDictType.FULL_STATE_DICT,
            FullStateDictConfig(offload_to_cpu=True, rank0_only=True),
            FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=True),
    ):
        # Merge checkpoint tensors over the current state so keys missing
        # from the checkpoint keep their current values (strict=False).
        current_state = model.state_dict()
        current_state.update(model_weights)
        model.load_state_dict(current_state, strict=False)
    if discriminator:
        optim_path = os.path.join(checkpoint_dir, "discriminator_optimizer.pt")
    else:
        optim_path = os.path.join(checkpoint_dir, "optimizer.pt")
    # weights_only=False: optimizer state contains arbitrary pickled objects.
    optimizer_state_dict = torch.load(optim_path, weights_only=False)
    optim_state = FSDP.optim_state_dict_to_load(
        model=model, optim=optimizer, optim_state_dict=optimizer_state_dict)
    optimizer.load_state_dict(optim_state)
    step = int(checkpoint_dir.split("-")[-1])
    return model, optimizer, step
255
+
256
+
257
def save_lora_checkpoint(transformer, optimizer, rank, output_dir, step,
                         pipeline, epoch):
    """Save LoRA adapter weights + optimizer state under
    ``output_dir/lora-checkpoint-{step}-{epoch}``.

    Gathers the full FSDP state (collective — call on all ranks), then on
    rank 0 extracts only the LoRA layers via peft and saves them through
    ``pipeline.save_lora_weights``, along with the optimizer state and a
    ``lora_config.json`` recording the step and LoRA hyperparameters.
    """
    with FSDP.state_dict_type(
            transformer,
            StateDictType.FULL_STATE_DICT,
            FullStateDictConfig(offload_to_cpu=True, rank0_only=True),
    ):
        full_state_dict = transformer.state_dict()
        lora_optim_state = FSDP.optim_state_dict(
            transformer,
            optimizer,
        )

    if rank <= 0:
        save_dir = os.path.join(output_dir, f"lora-checkpoint-{step}-{epoch}")
        os.makedirs(save_dir, exist_ok=True)

        # save optimizer
        optim_path = os.path.join(save_dir, "lora_optimizer.pt")
        torch.save(lora_optim_state, optim_path)
        # save lora weight
        main_print(f"--> saving LoRA checkpoint at step {step}")
        # Filter the full state dict down to just the LoRA adapter tensors.
        transformer_lora_layers = get_peft_model_state_dict(
            model=transformer, state_dict=full_state_dict)
        pipeline.save_lora_weights(
            save_directory=save_dir,
            transformer_lora_layers=transformer_lora_layers,
            is_main_process=True,
        )
        # save config
        lora_config = {
            "step": step,
            "lora_params": {
                "lora_rank": transformer.config.lora_rank,
                "lora_alpha": transformer.config.lora_alpha,
                "target_modules": transformer.config.lora_target_modules,
            },
        }
        config_path = os.path.join(save_dir, "lora_config.json")
        with open(config_path, "w") as f:
            json.dump(lora_config, f, indent=4)
        main_print(f"--> LoRA checkpoint saved at step {step}")
299
+
300
+
301
def resume_lora_optimizer(transformer, checkpoint_dir, optimizer):
    """Restore the LoRA optimizer state saved by ``save_lora_checkpoint``.

    Reads the step from ``lora_config.json`` and loads ``lora_optimizer.pt``
    into *optimizer*. Note: the transformer weights themselves are NOT loaded
    here — only the optimizer state and step counter.
    Returns ``(transformer, optimizer, step)``.
    """
    config_path = os.path.join(checkpoint_dir, "lora_config.json")
    with open(config_path, "r") as f:
        config_dict = json.load(f)
    optim_path = os.path.join(checkpoint_dir, "lora_optimizer.pt")
    # weights_only=False: optimizer state contains arbitrary pickled objects.
    optimizer_state_dict = torch.load(optim_path, weights_only=False)
    optim_state = FSDP.optim_state_dict_to_load(
        model=transformer,
        optim=optimizer,
        optim_state_dict=optimizer_state_dict)
    optimizer.load_state_dict(optim_state)
    step = config_dict["step"]
    main_print(f"--> Successfully resuming LoRA optimizer from step {step}")
    return transformer, optimizer, step
fastvideo/utils/communications.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #This code file is from [https://github.com/hao-ai-lab/FastVideo], which is licensed under Apache License 2.0.
2
+
3
+ from typing import Any, Tuple
4
+
5
+ import torch
6
+ import torch.distributed as dist
7
+ from torch import Tensor
8
+
9
+ from fastvideo.utils.parallel_states import nccl_info
10
+
11
+
12
def broadcast(input_: torch.Tensor):
    """Broadcast ``input_`` in place from the first rank of this sequence-parallel group."""
    # The group leader is the first global rank belonging to this SP group.
    source_rank = nccl_info.group_id * nccl_info.sp_size
    dist.broadcast(input_, src=source_rank, group=nccl_info.group)
15
+
16
+
17
def _all_to_all_4D(input: torch.tensor,
                   scatter_idx: int = 2,
                   gather_idx: int = 1,
                   group=None) -> torch.tensor:
    """
    all-to-all for QKV

    Reshards a 4-D ``(bs, seq, heads, head_dim)`` tensor across the process
    group: scattering along ``scatter_idx`` while gathering along
    ``gather_idx``. Only the two head/sequence resharding combinations are
    supported.

    Args:
        input (torch.tensor): a tensor sharded along dim ``gather_idx``
        scatter_idx (int): dimension to scatter (default 2, the head dim)
        gather_idx (int): dimension to gather (default 1, the sequence dim)
        group : torch process group

    Returns:
        torch.tensor: resharded tensor, e.g. (bs, seqlen, hc/P, hs) for the
        default indices.

    Raises:
        RuntimeError: if ``(scatter_idx, gather_idx)`` is not (2, 1) or (1, 2).
    """
    assert (
        input.dim() == 4
    ), f"input must be 4D tensor, got {input.dim()} and shape {input.shape}"

    seq_world_size = dist.get_world_size(group)

    if scatter_idx == 2 and gather_idx == 1:
        # input (torch.tensor): a tensor sharded along dim 1 (bs, seqlen/P, hc, hs) output: (bs, seqlen, hc/P, hs)
        bs, shard_seqlen, hc, hs = input.shape
        seqlen = shard_seqlen * seq_world_size
        shard_hc = hc // seq_world_size

        # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
        # (bs, seqlen/P, hc, hs) -reshape-> (bs, seq_len/P, P, hc/P, hs) -transpose(0,2)-> (P, seq_len/P, bs, hc/P, hs)
        input_t = (input.reshape(bs, shard_seqlen, seq_world_size, shard_hc,
                                 hs).transpose(0, 2).contiguous())

        output = torch.empty_like(input_t)
        # https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_to_all_single
        # (P, seq_len/P, bs, hc/P, hs) scatter seqlen -all2all-> (P, seq_len/P, bs, hc/P, hs) scatter head
        if seq_world_size > 1:
            dist.all_to_all_single(output, input_t, group=group)
            torch.cuda.synchronize()
        else:
            output = input_t
        # if scattering the seq-dim, transpose the heads back to the original dimension
        output = output.reshape(seqlen, bs, shard_hc, hs)

        # (seq_len, bs, hc/P, hs) -reshape-> (bs, seq_len, hc/P, hs)
        output = output.transpose(0, 1).contiguous().reshape(
            bs, seqlen, shard_hc, hs)

        return output

    elif scatter_idx == 1 and gather_idx == 2:
        # input (torch.tensor): a tensor sharded along dim 2 (bs, seqlen, hc/P, hs) output: (bs, seqlen/P, hc, hs)
        bs, seqlen, shard_hc, hs = input.shape
        hc = shard_hc * seq_world_size
        shard_seqlen = seqlen // seq_world_size
        # NOTE: seq_world_size was already computed above; the original code
        # redundantly re-queried it here.

        # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
        # (bs, seqlen, hc/P, hs) -reshape-> (bs, P, seq_len/P, hc/P, hs) -transpose(0, 3)-> (hc/P, P, seqlen/P, bs, hs) -transpose(0, 1) -> (P, hc/P, seqlen/P, bs, hs)
        input_t = (input.reshape(
            bs, seq_world_size, shard_seqlen, shard_hc,
            hs).transpose(0, 3).transpose(0, 1).contiguous().reshape(
                seq_world_size, shard_hc, shard_seqlen, bs, hs))

        output = torch.empty_like(input_t)
        # https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_to_all_single
        # (P, bs x hc/P, seqlen/P, hs) scatter seqlen -all2all-> (P, bs x seq_len/P, hc/P, hs) scatter head
        if seq_world_size > 1:
            dist.all_to_all_single(output, input_t, group=group)
            torch.cuda.synchronize()
        else:
            output = input_t

        # if scattering the seq-dim, transpose the heads back to the original dimension
        output = output.reshape(hc, shard_seqlen, bs, hs)

        # (hc, seqlen/N, bs, hs) -tranpose(0,2)-> (bs, seqlen/N, hc, hs)
        output = output.transpose(0, 2).contiguous().reshape(
            bs, shard_seqlen, hc, hs)

        return output
    else:
        raise RuntimeError(
            "scatter_idx must be 1 or 2 and gather_idx must be 1 or 2")
101
+
102
+
103
class SeqAllToAll4D(torch.autograd.Function):
    """Autograd wrapper around :func:`_all_to_all_4D`.

    Forward reshards a 4-D tensor between the sequence and head dimensions;
    backward applies the inverse resharding (scatter/gather indices swapped)
    to the incoming gradient.
    """

    @staticmethod
    def forward(
        ctx: Any,
        group: dist.ProcessGroup,
        input: Tensor,
        scatter_idx: int,
        gather_idx: int,
    ) -> Tensor:
        # Stash communication parameters so backward can run the inverse op.
        ctx.group = group
        ctx.scatter_idx = scatter_idx
        ctx.gather_idx = gather_idx

        return _all_to_all_4D(input, scatter_idx, gather_idx, group=group)

    @staticmethod
    def backward(ctx: Any,
                 *grad_output: Tensor) -> Tuple[None, Tensor, None, None]:
        # The inverse of an all-to-all is an all-to-all with the scatter and
        # gather indices swapped. Non-tensor forward args (group, indices)
        # receive None gradients.
        return (
            None,
            SeqAllToAll4D.apply(ctx.group, *grad_output, ctx.gather_idx,
                                ctx.scatter_idx),
            None,
            None,
        )
129
+
130
+
131
def all_to_all_4D(
    input_: torch.Tensor,
    scatter_dim: int = 2,
    gather_dim: int = 1,
):
    """Differentiable all-to-all resharding of a 4-D QKV tensor.

    Runs over the sequence-parallel group from ``nccl_info``; by default
    scatters the head dimension (2) and gathers the sequence dimension (1).
    """
    return SeqAllToAll4D.apply(nccl_info.group, input_, scatter_dim, gather_dim)
138
+
139
+
140
def _all_to_all(
    input_: torch.Tensor,
    world_size: int,
    group: dist.ProcessGroup,
    scatter_dim: int,
    gather_dim: int,
):
    """Exchange shards of ``input_`` among all ranks of ``group``.

    Splits ``input_`` into ``world_size`` pieces along ``scatter_dim``, sends
    piece ``i`` to rank ``i``, and concatenates the received pieces along
    ``gather_dim``.
    """
    shards = [
        shard.contiguous()
        for shard in torch.tensor_split(input_, world_size, scatter_dim)
    ]
    # Receive buffers; assumes every rank contributes equally-sized shards.
    received = [torch.empty_like(shards[0]) for _ in range(world_size)]
    dist.all_to_all(received, shards, group=group)
    return torch.cat(received, dim=gather_dim).contiguous()
154
+
155
+
156
class _AllToAll(torch.autograd.Function):
    """All-to-all communication.

    Differentiable wrapper over :func:`_all_to_all`; backward performs the
    inverse exchange with scatter/gather dimensions swapped.

    Args:
        input_: input matrix
        process_group: communication group
        scatter_dim: scatter dimension
        gather_dim: gather dimension
    """

    @staticmethod
    def forward(ctx, input_, process_group, scatter_dim, gather_dim):
        # Save the communication parameters for the backward pass.
        ctx.process_group = process_group
        ctx.scatter_dim = scatter_dim
        ctx.gather_dim = gather_dim
        ctx.world_size = dist.get_world_size(process_group)
        output = _all_to_all(input_, ctx.world_size, process_group,
                             scatter_dim, gather_dim)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        # Inverse all-to-all: swap the scatter and gather dimensions.
        grad_output = _all_to_all(
            grad_output,
            ctx.world_size,
            ctx.process_group,
            ctx.gather_dim,
            ctx.scatter_dim,
        )
        # None gradients for the non-tensor forward arguments.
        return (
            grad_output,
            None,
            None,
            None,
        )
191
+
192
+
193
def all_to_all(
    input_: torch.Tensor,
    scatter_dim: int = 2,
    gather_dim: int = 1,
):
    """Differentiable all-to-all over the sequence-parallel group.

    Scatters ``input_`` along ``scatter_dim`` and gathers along ``gather_dim``
    using the group configured in ``nccl_info``.
    """
    resharded = _AllToAll.apply(input_, nccl_info.group, scatter_dim, gather_dim)
    return resharded
199
+
200
+
201
class _AllGather(torch.autograd.Function):
    """All-gather communication with autograd support.

    Forward concatenates each rank's tensor along ``dim``; backward hands each
    rank the slice of the gradient corresponding to its own contribution.

    Args:
        input_: input tensor
        dim: dimension along which to concatenate
    """

    @staticmethod
    def forward(ctx, input_, dim):
        ctx.dim = dim
        world_size = nccl_info.sp_size
        group = nccl_info.group
        input_size = list(input_.size())

        # Remember this rank's extent along `dim` so backward can slice the
        # gradient back out. Assumes all ranks contribute equal-sized tensors.
        ctx.input_size = input_size[dim]

        tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
        input_ = input_.contiguous()
        dist.all_gather(tensor_list, input_, group=group)

        output = torch.cat(tensor_list, dim=dim)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        world_size = nccl_info.sp_size
        rank = nccl_info.rank_within_group
        dim = ctx.dim
        input_size = ctx.input_size

        # Equal-sized split: one chunk per rank along the gathered dimension.
        sizes = [input_size] * world_size

        grad_input_list = torch.split(grad_output, sizes, dim=dim)
        # Each rank keeps only the gradient of its own forward input.
        grad_input = grad_input_list[rank]

        return grad_input, None
238
+
239
+
240
def all_gather(input_: torch.Tensor, dim: int = 1):
    """Differentiable all-gather over the sequence-parallel group.

    Args:
        input_ (torch.Tensor): Input tensor of shape [B, H, S, D].
        dim (int, optional): Dimension along which to concatenate. Defaults to 1.

    Returns:
        torch.Tensor: Output tensor after all-gather operation, concatenated along 'dim'.
    """
    gathered = _AllGather.apply(input_, dim)
    return gathered
251
+
252
+
253
def prepare_sequence_parallel_data(
    encoder_hidden_states, encoder_attention_mask, caption
):
    """Reshard conditioning tensors for sequence parallelism.

    Each rank's text embeddings and attention mask are repeated ``sp_size``
    times along the sequence dimension, then an all-to-all scatters that
    dimension and gathers the batch dimension, so every rank in the SP group
    ends up with the full batch. No-op when ``sp_size == 1``.

    Returns:
        ``(encoder_hidden_states, encoder_attention_mask, caption)`` resharded
        (or unchanged for ``sp_size == 1``).
    """
    sp_size = nccl_info.sp_size
    if sp_size == 1:
        return encoder_hidden_states, encoder_attention_mask, caption

    # Repeat along seq dim so each rank's shard after scatter is a full copy.
    encoder_hidden_states = all_to_all(
        encoder_hidden_states.repeat(1, sp_size, 1),
        scatter_dim=1,
        gather_dim=0,
    )
    encoder_attention_mask = all_to_all(
        encoder_attention_mask.repeat(1, sp_size),
        scatter_dim=1,
        gather_dim=0,
    )
    return encoder_hidden_states, encoder_attention_mask, caption
299
+
300
+
301
def sp_parallel_dataloader_wrapper(
    dataloader, device, train_batch_size, sp_size, train_sp_batch_size
):
    """Infinite generator adapting a dataloader for sequence parallelism.

    Moves each batch's conditioning tensors to ``device``, reshards them
    across the sequence-parallel group, and yields micro-batches of size
    ``train_sp_batch_size`` sliced from the globally gathered batch.

    Args:
        dataloader: iterable yielding ``(cond, cond_mask, caption)`` items.
        device: target device for the tensors.
        train_batch_size: per-rank batch size of ``dataloader``.
        sp_size: sequence-parallel group size.
        train_sp_batch_size: micro-batch size yielded per step.

    Yields:
        ``(encoder_hidden_states, encoder_attention_mask, caption)`` tuples.
    """
    while True:
        for data_item in dataloader:
            cond, cond_mask, caption = data_item
            cond = cond.to(device)
            cond_mask = cond_mask.to(device)
            # NOTE(review): frame count is hard-coded; the original version
            # derived it from latents.shape[2] (see commented-out lines).
            # With frame == 19 the single-frame branch below never runs.
            frame = 19
            if frame == 1:
                yield cond, cond_mask, caption
            else:
                cond, cond_mask, caption = prepare_sequence_parallel_data(
                    cond, cond_mask, caption
                )
                assert (
                    train_batch_size * sp_size >= train_sp_batch_size
                ), "train_batch_size * sp_size should be greater than train_sp_batch_size"
                # Slice the gathered global batch into SP micro-batches.
                # (renamed from `iter`, which shadowed the builtin)
                for micro_step in range(
                        train_batch_size * sp_size // train_sp_batch_size):
                    st_idx = micro_step * train_sp_batch_size
                    ed_idx = (micro_step + 1) * train_sp_batch_size
                    encoder_hidden_states = cond[st_idx:ed_idx]
                    encoder_attention_mask = cond_mask[st_idx:ed_idx]
                    yield (
                        encoder_hidden_states,
                        encoder_attention_mask,
                        caption
                    )
335
+