# Copyright 2024 The HuggingFace Team and The MeissonFlow Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL.ImageOps import exif_transpose
from PIL import Image
import io
import pyarrow.parquet as pq
import random
import bisect
import pyarrow.fs as fs
import csv
import numpy as np
import logging

logger = logging.getLogger(__name__)

@torch.no_grad()
def tokenize_prompt(tokenizer, prompt, text_encoder_architecture='open_clip'): # support open_clip, CLIP, T5/UMT5
    if text_encoder_architecture == 'CLIP' or text_encoder_architecture == 'open_clip':
        return tokenizer(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=77,
            return_tensors="pt",
        ).input_ids
    elif text_encoder_architecture in ['umt5-base', 'umt5-xxl', 't5']:
        # T5/UMT5 tokenizer
        return tokenizer(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt",
        ).input_ids
    elif text_encoder_architecture == 'CLIP_T5_base': # we have two tokenizers, 1st for CLIP, 2nd for T5
        input_ids = []
        input_ids.append(tokenizer[0](
            prompt,
            truncation=True,
            padding="max_length",
            max_length=77,
            return_tensors="pt",
        ).input_ids)
        input_ids.append(tokenizer[1](
            prompt,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt",
        ).input_ids)
        return input_ids
    else:
        raise ValueError(f"Unknown text_encoder_architecture: {text_encoder_architecture}")
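
# Usage sketch (illustrative only; the checkpoint name is an example, not a
# requirement of this module):
#
#   from transformers import CLIPTokenizer
#   tok = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
#   ids = tokenize_prompt(tok, "a photo of a cat", text_encoder_architecture="CLIP")
#   # ids: LongTensor of shape [1, 77], padded/truncated to max_length=77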

def encode_prompt(text_encoder, input_ids, text_encoder_architecture='open_clip'):  # support open_clip, CLIP, T5/UMT5
    if text_encoder_architecture == 'CLIP' or text_encoder_architecture == 'open_clip':
        outputs = text_encoder(input_ids=input_ids, return_dict=True, output_hidden_states=True)
        encoder_hidden_states = outputs.hidden_states[-2]
        cond_embeds = outputs[0]
        return encoder_hidden_states, cond_embeds
    elif text_encoder_architecture in ['umt5-base', 'umt5-xxl', 't5']:
        # T5/UMT5 encoder - only returns encoder_hidden_states, no pooled projection
        outputs = text_encoder(input_ids=input_ids, return_dict=True)
        encoder_hidden_states = outputs.last_hidden_state
        # For T5, we don't have a pooled projection, so return None or a dummy tensor
        # The video pipeline doesn't use cond_embeds, so we can return None
        cond_embeds = None
        return encoder_hidden_states, cond_embeds
    elif text_encoder_architecture == 'CLIP_T5_base':
        outputs_clip = text_encoder[0](input_ids=input_ids[0], return_dict=True, output_hidden_states=True)
        outputs_t5 = text_encoder[1](input_ids=input_ids[1], decoder_input_ids=torch.zeros_like(input_ids[1]),
                               return_dict=True, output_hidden_states=True)
        encoder_hidden_states = outputs_t5.encoder_hidden_states[-2]
        cond_embeds = outputs_clip[0]
        return encoder_hidden_states, cond_embeds
    else:
        raise ValueError(f"Unknown text_encoder_architecture: {text_encoder_architecture}")
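
# Companion sketch for encode_prompt (illustrative; model name is an example).
# CLIP-style encoders yield the penultimate hidden states plus a pooled
# projection; for T5/UMT5 the pooled embedding is None:
#
#   from transformers import CLIPTextModelWithProjection
#   enc = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
#   hidden, pooled = encode_prompt(enc, ids, text_encoder_architecture="CLIP")
#   # hidden: [1, 77, hidden_dim], pooled: [1, projection_dim]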


def process_image(image, size, norm=False, hps_score=6.0):
    image = exif_transpose(image)

    if not image.mode == "RGB":
        image = image.convert("RGB")

    orig_height = image.height
    orig_width = image.width

    image = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)(image)

    c_top, c_left, _, _ = transforms.RandomCrop.get_params(image, output_size=(size, size))
    image = transforms.functional.crop(image, c_top, c_left, size, size)
    image = transforms.ToTensor()(image)

    if norm:
        image = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)(image)

    micro_conds = torch.tensor(
        [orig_width, orig_height, c_top, c_left, hps_score],
    )

    return {"image": image, "micro_conds": micro_conds}


class MyParquetDataset(Dataset):
    def __init__(self, root_dir, tokenizer=None, size=512,
                 text_encoder_architecture='CLIP', norm=False):
        random.seed(23)

        self.root_dir = root_dir
        self.dataset_recipe = {'MSCOCO_part1': {'total_num': 6212, 'ratio': 1}, 'MSCOCO_part2': {'total_num': 6212, 'ratio': 1}}

        self.tokenizer = tokenizer
        self.size = size
        self.text_encoder_architecture = text_encoder_architecture
        self.norm = norm

        self.hdfs = fs.HadoopFileSystem(host="", port=0)  # TODO: set your own HDFS host and port
        self._init_mixed_parquet_dir_list()

        self.file_metadata = []
        self.cumulative_sizes = [0]
        total = 0
        for path in self.parquet_files:
            try:
                with pq.ParquetFile(path, filesystem=self.hdfs) as pf:
                    num_rows = pf.metadata.num_rows
                    self.file_metadata.append({
                        'path': path,
                        'num_rows': num_rows,
                        'global_offset': total
                    })
                    total += num_rows
                    self.cumulative_sizes.append(total)
            except Exception as e:
                print(f"Error processing {path}: {str(e)}")
                continue

        # init cache
        self.current_file = None
        self.cached_data = None
        self.cached_file_index = -1

    def _init_mixed_parquet_dir_list(self):
        logger.info('Loading parquet files, please be patient...')
        self.parquet_files = []

        for key, value in self.dataset_recipe.items():
            # Build the list of standard Parquet shard paths; files are only opened lazily on access
            hdfs_path = os.path.join(self.root_dir, key)

            num = value['total_num']
            sampled_list = random.sample(
                [f"{hdfs_path}/train-{idx:05d}-of-{num:05d}.parquet" for idx in range(num)],
                k=int(num * value['ratio'])
            )
            self.parquet_files += sampled_list

    def __len__(self):
        return self.cumulative_sizes[-1]

    def _locate_file(self, global_idx):
        # Use binary search to quickly locate files
        file_index = bisect.bisect_right(self.cumulative_sizes, global_idx) - 1
        if file_index < 0 or file_index >= len(self.file_metadata):
            raise IndexError(f"Index {global_idx} out of range")

        file_info = self.file_metadata[file_index]
        local_idx = global_idx - file_info['global_offset']
        return file_index, local_idx
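
    # Worked example (illustrative): with two files of 100 and 50 rows,
    # cumulative_sizes == [0, 100, 150]; global_idx=120 gives
    # bisect_right(...) - 1 == 1 (second file) and local_idx == 20.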

    def _load_file(self, file_index):
        """Load Parquet files into cache on demand"""
        if self.cached_file_index != file_index:
            file_info = self.file_metadata[file_index]
            try:
                table = pq.read_table(file_info['path'], filesystem=self.hdfs)
                self.cached_data = table.to_pydict()
                self.cached_file_index = file_index
            except Exception as e:
                print(f"Error loading {file_info['path']}: {str(e)}")
                raise

    def __getitem__(self, idx):
        file_index, local_idx = self._locate_file(idx)
        self._load_file(file_index)
        sample = {k: v[local_idx] for k, v in self.cached_data.items()}

        generated_caption, image_data = sample['task2'], sample['image']  # keys specific to this dataset's schema
        instance_image = Image.open(io.BytesIO(image_data['bytes']))

        # if instance_image.width < self.size or instance_image.height < self.size:
        #     raise ValueError(f"Image at {image_path} is too small")

        rv = process_image(instance_image, self.size, self.norm)

        if isinstance(self.tokenizer, list):
            _tmp_ = tokenize_prompt(self.tokenizer, generated_caption, self.text_encoder_architecture)
            rv["prompt_input_ids"] = [_tmp_[0][0], _tmp_[1][0]]
        else:
            rv["prompt_input_ids"] = tokenize_prompt(self.tokenizer, generated_caption, self.text_encoder_architecture)[
                0]

        return rv

class HuggingFaceDataset(Dataset):
    def __init__(
        self,
        hf_dataset,
        tokenizer,
        image_key,
        prompt_key,
        prompt_prefix=None,
        size=512,
        text_encoder_architecture='CLIP',
    ):
        self.size = size
        self.image_key = image_key
        self.prompt_key = prompt_key
        self.tokenizer = tokenizer
        self.hf_dataset = hf_dataset
        self.prompt_prefix = prompt_prefix
        self.text_encoder_architecture = text_encoder_architecture

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, index):
        item = self.hf_dataset[index]

        rv = process_image(item[self.image_key], self.size)

        prompt = item[self.prompt_key]

        if self.prompt_prefix is not None:
            prompt = self.prompt_prefix + prompt

        if isinstance(self.tokenizer, list):
            _tmp_ = tokenize_prompt(self.tokenizer, prompt, self.text_encoder_architecture)
            rv["prompt_input_ids"] = [_tmp_[0][0],_tmp_[1][0]]
        else:
            rv["prompt_input_ids"] = tokenize_prompt(self.tokenizer, prompt, self.text_encoder_architecture)[0]

        return rv
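
# Usage sketch (illustrative; the dataset name and column keys are examples):
#
#   from datasets import load_dataset
#   ds = load_dataset("diffusers/pokemon-gpt4-captions", split="train")
#   train_ds = HuggingFaceDataset(ds, tokenizer=tok, image_key="image",
#                                 prompt_key="text", size=512)
#   sample = train_ds[0]  # keys: "image", "micro_conds", "prompt_input_ids"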


def process_video(video_tensor, num_frames, height, width, use_random_crop=True):
    """
    Process video tensor for training.
    
    Uses aspect-ratio preserving resize + crop to avoid distortion.
    
    Args:
        video_tensor: Video tensor of shape [C, F, H, W] or [F, H, W, C]
        num_frames: Target number of frames
        height: Target height
        width: Target width
        use_random_crop: If True, use random crop (for training). If False, use center crop (for validation/feature extraction)
    
    Returns:
        Processed video tensor of shape [C, F, H, W] in [0, 1] range
    """
    # Ensure video is in [C, F, H, W] format
    if video_tensor.dim() == 4:
        if video_tensor.shape[0] == 3 or video_tensor.shape[0] == 1:
            # Already in [C, F, H, W] format
            pass
        elif video_tensor.shape[-1] == 3 or video_tensor.shape[-1] == 1:
            # [F, H, W, C] -> [C, F, H, W]
            video_tensor = video_tensor.permute(3, 0, 1, 2)
        else:
            raise ValueError(f"Unexpected video tensor shape: {video_tensor.shape}")
    
    # Normalize to [0, 1] if needed
    if video_tensor.max() > 1.0:
        video_tensor = video_tensor / 255.0
    
    C, F, H, W = video_tensor.shape
    
    # Temporal resampling: ensure exactly num_frames frames
    if F != num_frames:
        if F < num_frames:
            # If video is shorter, pad by repeating the last frame
            num_pad = num_frames - F
            last_frame = video_tensor[:, -1:, :, :]  # [C, 1, H, W]
            padding = last_frame.repeat(1, num_pad, 1, 1)  # [C, num_pad, H, W]
            video_tensor = torch.cat([video_tensor, padding], dim=1)  # [C, num_frames, H, W]
            F = num_frames
        else:
            # If video is longer, randomly select a continuous segment of num_frames
            max_start = F - num_frames
            start_idx = random.randint(0, max_start)
            indices = torch.arange(start_idx, start_idx + num_frames)
            video_tensor = video_tensor[:, indices, :, :]
            F = num_frames  # Update F after temporal resampling
    
    # Spatial resizing: aspect-ratio preserving resize + crop
    if H != height or W != width:
        # Step 1: Aspect-ratio preserving resize
        # Calculate scale factors for both dimensions
        scale_h = height / H
        scale_w = width / W
        
        # Use the larger scale to ensure both dimensions are at least as large as target
        # This way, after resize, we can crop to exact target size
        scale = max(scale_h, scale_w)
        
        # Calculate new dimensions maintaining aspect ratio
        new_H = int(H * scale)
        new_W = int(W * scale)
        
        # Ensure we have at least the target size (handle rounding)
        if new_H < height:
            new_H = height
        if new_W < width:
            new_W = width
        
        # Resize maintaining aspect ratio
        # Process each frame: [C, F, H, W] -> reshape to [C*F, 1, H, W] for interpolation
        video_tensor = torch.nn.functional.interpolate(
            video_tensor.reshape(C * F, 1, H, W),
            size=(new_H, new_W),
            mode='bilinear',
            align_corners=False
        ).reshape(C, F, new_H, new_W)
        
        # Step 2: Crop to target size (height, width)
        # Calculate crop coordinates
        if use_random_crop:
            # Random crop for training (data augmentation)
            max_h = new_H - height
            max_w = new_W - width
            if max_h < 0 or max_w < 0:
                # If resized image is smaller than target, pad instead
                pad_h = max(0, height - new_H)
                pad_w = max(0, width - new_W)
                video_tensor = torch.nn.functional.pad(
                    video_tensor,
                    (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2),
                    mode='constant',
                    value=0
                )
                # If the shape is still not exact, fall back to a direct resize to the target size
                if video_tensor.shape[2] != height or video_tensor.shape[3] != width:
                    video_tensor = torch.nn.functional.interpolate(
                        video_tensor.reshape(C * F, 1, video_tensor.shape[2], video_tensor.shape[3]),
                        size=(height, width),
                        mode='bilinear',
                        align_corners=False
                    ).reshape(C, F, height, width)
            else:
                crop_h = random.randint(0, max_h)
                crop_w = random.randint(0, max_w)
                video_tensor = video_tensor[:, :, crop_h:crop_h + height, crop_w:crop_w + width]
        else:
            # Center crop for validation/feature extraction (deterministic)
            crop_h = (new_H - height) // 2
            crop_w = (new_W - width) // 2
            if crop_h < 0 or crop_w < 0:
                # If resized image is smaller than target, pad instead
                pad_h = max(0, height - new_H)
                pad_w = max(0, width - new_W)
                video_tensor = torch.nn.functional.pad(
                    video_tensor,
                    (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2),
                    mode='constant',
                    value=0
                )
                # If the shape is still not exact, fall back to a direct resize to the target size
                if video_tensor.shape[2] != height or video_tensor.shape[3] != width:
                    video_tensor = torch.nn.functional.interpolate(
                        video_tensor.reshape(C * F, 1, video_tensor.shape[2], video_tensor.shape[3]),
                        size=(height, width),
                        mode='bilinear',
                        align_corners=False
                    ).reshape(C, F, height, width)
            else:
                video_tensor = video_tensor[:, :, crop_h:crop_h + height, crop_w:crop_w + width]
    
    # Final verification: ensure output has exactly the expected shape
    C, F, H, W = video_tensor.shape
    assert F == num_frames, f"Frame count mismatch: expected {num_frames}, got {F}"
    assert H == height, f"Height mismatch: expected {height}, got {H}"
    assert W == width, f"Width mismatch: expected {width}, got {W}"
    
    return video_tensor
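
# Shape-contract sketch for process_video (illustrative):
#
#   vid = torch.randint(0, 256, (3, 40, 360, 640)).float()  # [C, F, H, W], 0-255
#   out = process_video(vid, num_frames=16, height=480, width=848)
#   # out: FloatTensor [3, 16, 480, 848] in [0, 1]; a random 16-frame segment is
#   # taken, then an aspect-preserving resize + random crop (center crop when
#   # use_random_crop=False).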


class VideoDataset(Dataset):
    """
    Dataset for video training, compatible with HuggingFace datasets format.
    Supports OpenVid1M and similar video-text datasets.
    """
    def __init__(
        self,
        hf_dataset,
        tokenizer,
        video_key="video",
        prompt_key="caption",
        prompt_prefix=None,
        num_frames=16,
        height=480,
        width=848,
        text_encoder_architecture='umt5-base',
        use_random_crop=True,  # Random crop for training, center crop for validation
    ):
        self.hf_dataset = hf_dataset
        self.tokenizer = tokenizer
        self.video_key = video_key
        self.prompt_key = prompt_key
        self.prompt_prefix = prompt_prefix
        self.num_frames = num_frames
        self.height = height
        self.width = width
        self.text_encoder_architecture = text_encoder_architecture
        self.use_random_crop = use_random_crop

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, index):
        item = self.hf_dataset[index]
        
        # Load video
        video = item[self.video_key]
        
        # Convert to tensor if needed (handle different formats)
        if isinstance(video, list):
            # List of PIL Images or tensors
            frames = []
            for frame in video:
                if isinstance(frame, Image.Image):
                    frame = transforms.ToTensor()(frame)
                frames.append(frame)
            video_tensor = torch.stack(frames, dim=1)  # [C, F, H, W]
        elif isinstance(video, torch.Tensor):
            video_tensor = video
        else:
            raise ValueError(f"Unsupported video type: {type(video)}")
        
        # Process video
        video_tensor = process_video(video_tensor, self.num_frames, self.height, self.width, use_random_crop=self.use_random_crop)
        
        # Ensure video tensor has exactly the expected shape
        C, F, H, W = video_tensor.shape
        if F != self.num_frames or H != self.height or W != self.width:
            # If shape doesn't match, create a properly sized tensor
            video_tensor = torch.nn.functional.interpolate(
                video_tensor.reshape(C * F, 1, H, W),
                size=(self.height, self.width),
                mode='bilinear',
                align_corners=False
            ).reshape(C, F, self.height, self.width)
            # Ensure exactly num_frames
            if F < self.num_frames:
                # Pad by repeating last frame
                num_pad = self.num_frames - F
                last_frame = video_tensor[:, -1:, :, :]
                padding = last_frame.repeat(1, num_pad, 1, 1)
                video_tensor = torch.cat([video_tensor, padding], dim=1)
            elif F > self.num_frames:
                # Crop to num_frames
                video_tensor = video_tensor[:, :self.num_frames, :, :]
        
        # Clone to ensure storage is resizable (required for DataLoader collate)
        video_tensor = video_tensor.contiguous().clone()
        
        # Process prompt
        prompt = item[self.prompt_key]
        if self.prompt_prefix is not None:
            prompt = self.prompt_prefix + prompt
        
        prompt_input_ids = tokenize_prompt(self.tokenizer, prompt, self.text_encoder_architecture)[0]
        # Clone to ensure storage is resizable
        prompt_input_ids = prompt_input_ids.clone()
        
        rv = {
            "video": video_tensor,  # [C, num_frames, height, width], guaranteed shape
            "prompt_input_ids": prompt_input_ids
        }
        
        return rv
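
# Usage sketch (illustrative; assumes `ds` is a HuggingFace dataset whose
# "video" column yields a list of PIL frames or a tensor):
#
#   from torch.utils.data import DataLoader
#   vds = VideoDataset(ds, tokenizer=tok, num_frames=16, height=480, width=848)
#   loader = DataLoader(vds, batch_size=2, num_workers=4)
#   batch = next(iter(loader))
#   # batch["video"]: [2, 3, 16, 480, 848]; batch["prompt_input_ids"]: [2, 512]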


class OpenVid1MDataset(Dataset):
    """
    Dataset for OpenVid1M video-text pairs from CSV file.
    
    CSV format:
        video,caption,aesthetic score,motion score,temporal consistency score,camera motion,frame,fps,seconds,new_id
    
    Returns:
        dict with keys:
            - "video": torch.Tensor of shape [C, F, H, W] in [0, 1] range
            - "prompt_input_ids": torch.Tensor of tokenized prompt
    """
    def __init__(
        self,
        csv_path,
        video_root_dir,
        tokenizer,
        num_frames=16,
        height=480,
        width=848,
        text_encoder_architecture='umt5-base',
        prompt_prefix=None,
        use_random_temporal_crop=True,  # If False, always sample from the beginning
        use_random_crop=True,  # Random crop for training, center crop for validation/feature extraction
    ):
        """
        Args:
            csv_path: Path to the CSV file containing video metadata
            video_root_dir: Root directory where video files are stored
            tokenizer: Text tokenizer
            num_frames: Target number of frames to extract
            height: Target height
            width: Target width
            text_encoder_architecture: Architecture of text encoder
            prompt_prefix: Optional prefix to add to prompts
            use_random_temporal_crop: If True, sample a random continuous frame segment; if False, always sample from the beginning
            use_random_crop: If True, use random spatial crops (training); if False, use center crops (validation/feature extraction)
        """
        self.csv_path = csv_path
        self.video_root_dir = video_root_dir
        self.tokenizer = tokenizer
        self.num_frames = num_frames
        self.height = height
        self.width = width
        self.text_encoder_architecture = text_encoder_architecture
        self.prompt_prefix = prompt_prefix
        self.use_random_temporal_crop = use_random_temporal_crop
        self.use_random_crop = use_random_crop
        
        # Load CSV data
        self.data = []
        with open(csv_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.data.append(row)
        
        logger.info(f"Loaded {len(self.data)} video entries from {csv_path}")
        
        # Try to import video loading library
        self.video_loader = None
        try:
            import decord
            decord.bridge.set_bridge('torch')
            self.video_loader = 'decord'
            logger.info("Using decord for video loading")
        except ImportError:
            try:
                import av
                self.video_loader = 'av'
                logger.info("Using PyAV for video loading")
            except ImportError:
                try:
                    import cv2
                    self.video_loader = 'cv2'
                    logger.info("Using OpenCV for video loading")
                except ImportError:
                    raise ImportError(
                        "No video loading library found. Please install one of: "
                        "decord (pip install decord), PyAV (pip install av), or opencv-python (pip install opencv-python)"
                    )

    def __len__(self):
        return len(self.data)

    def _load_video_decord(self, video_path):
        """Load video using decord"""
        import decord
        vr = decord.VideoReader(video_path, ctx=decord.cpu(0))
        total_frames = len(vr)
        
        # Sample frames: random temporal crop (continuous segment) for better temporal coherence
        if total_frames <= self.num_frames:
            indices = list(range(total_frames))
        else:
            if self.use_random_temporal_crop:
                # Randomly select a continuous segment of num_frames
                max_start = total_frames - self.num_frames
                start_idx = random.randint(0, max_start)
            else:
                # Fixed sampling: always start from the beginning
                start_idx = 0
            indices = list(range(start_idx, start_idx + self.num_frames))
        
        frames = vr.get_batch(indices)  # [F, H, W, C] in uint8
        # If using torch bridge, frames is already a torch Tensor
        if isinstance(frames, torch.Tensor):
            frames = frames.float()  # [F, H, W, C]
        else:
            # Use torch.tensor() instead of torch.from_numpy() to ensure a complete copy
            # This avoids "Trying to resize storage that is not resizable" errors in DataLoader collate
            frames = torch.tensor(frames, dtype=torch.float32)  # [F, H, W, C], fully copied
        frames = frames.permute(3, 0, 1, 2)  # [C, F, H, W]
        frames = frames / 255.0  # Normalize to [0, 1]
        
        return frames

    def _load_video_av(self, video_path):
        """Load video using PyAV"""
        import av
        container = av.open(video_path)
        frames = []
        
        # Get video stream
        video_stream = container.streams.video[0]
        total_frames = video_stream.frames if video_stream.frames > 0 else None
        
        # Sample frames: random temporal crop (continuous segment) for better temporal coherence
        if total_frames is None:
            # If we can't get frame count, decode all frames and sample
            frame_list = []
            for frame in container.decode(video_stream):
                frame_list.append(frame)
            total_frames = len(frame_list)
            if total_frames <= self.num_frames:
                frame_indices = list(range(total_frames))
            else:
                if self.use_random_temporal_crop:
                    # Randomly select a continuous segment of num_frames
                    max_start = total_frames - self.num_frames
                    start_idx = random.randint(0, max_start)
                else:
                    # Fixed sampling: always start from the beginning
                    start_idx = 0
                frame_indices = list(range(start_idx, start_idx + self.num_frames))
            frames = [transforms.ToTensor()(frame_list[i].to_image()) for i in frame_indices]
        else:
            if total_frames <= self.num_frames:
                frame_indices = list(range(total_frames))
            else:
                if self.use_random_temporal_crop:
                    # Randomly select a continuous segment of num_frames
                    max_start = total_frames - self.num_frames
                    start_idx = random.randint(0, max_start)
                else:
                    # Fixed sampling: always start from the beginning
                    start_idx = 0
                frame_indices = list(range(start_idx, start_idx + self.num_frames))
            
            frame_idx = 0
            for frame in container.decode(video_stream):
                if frame_idx in frame_indices:
                    img = frame.to_image()  # PIL Image
                    img_tensor = transforms.ToTensor()(img)  # [C, H, W]
                    frames.append(img_tensor)
                    if len(frames) >= self.num_frames:
                        break
                frame_idx += 1
        
        container.close()
        
        if len(frames) == 0:
            raise ValueError(f"No frames extracted from {video_path}")
        
        # Stack frames: [C, F, H, W]
        video_tensor = torch.stack(frames, dim=1)
        
        # Pad if needed
        if video_tensor.shape[1] < self.num_frames:
            padding = torch.zeros(
                video_tensor.shape[0], 
                self.num_frames - video_tensor.shape[1],
                video_tensor.shape[2],
                video_tensor.shape[3]
            )
            video_tensor = torch.cat([video_tensor, padding], dim=1)
        
        return video_tensor

    def _load_video_cv2(self, video_path):
        """Load video using OpenCV"""
        import cv2
        cap = cv2.VideoCapture(video_path)
        frames = []
        
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        
        # Sample frames: random temporal crop (continuous segment) for better temporal coherence
        if total_frames <= self.num_frames:
            frame_indices = list(range(total_frames))
        else:
            if self.use_random_temporal_crop:
                # Randomly select a continuous segment of num_frames
                max_start = total_frames - self.num_frames
                start_idx = random.randint(0, max_start)
            else:
                # Fixed sampling: always start from the beginning
                start_idx = 0
            frame_indices = list(range(start_idx, start_idx + self.num_frames))
        
        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_idx in frame_indices:
                # Convert BGR to RGB
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Convert to tensor [C, H, W] and normalize to [0, 1]
                # Use torch.tensor() instead of torch.from_numpy() to ensure a complete copy
                # This avoids "Trying to resize storage that is not resizable" errors in DataLoader collate
                frame_tensor = torch.tensor(frame_rgb, dtype=torch.float32).permute(2, 0, 1) / 255.0
                frames.append(frame_tensor)
                if len(frames) >= self.num_frames:
                    break
            frame_idx += 1
        
        cap.release()
        
        if len(frames) == 0:
            raise ValueError(f"No frames extracted from {video_path}")
        
        # Stack frames: [C, F, H, W]
        video_tensor = torch.stack(frames, dim=1)
        
        # Pad if needed
        if video_tensor.shape[1] < self.num_frames:
            padding = torch.zeros(
                video_tensor.shape[0], 
                self.num_frames - video_tensor.shape[1],
                video_tensor.shape[2],
                video_tensor.shape[3]
            )
            video_tensor = torch.cat([video_tensor, padding], dim=1)
        
        return video_tensor

    def _load_video(self, video_path):
        """Load video from path using the available video loader"""
        full_path = os.path.join(self.video_root_dir, video_path)
        
        if not os.path.exists(full_path):
            raise FileNotFoundError(f"Video file not found: {full_path}")
        
        if self.video_loader == 'decord':
            return self._load_video_decord(full_path)
        elif self.video_loader == 'av':
            return self._load_video_av(full_path)
        elif self.video_loader == 'cv2':
            return self._load_video_cv2(full_path)
        else:
            raise ValueError(f"Unknown video loader: {self.video_loader}")

    def __getitem__(self, index):
        row = self.data[index]
        
        # Load video
        video_path = row['video']
        try:
            video_tensor = self._load_video(video_path)
        except Exception as e:
            # If video loading fails, return a zero tensor and log error
            logger.warning(f"Failed to load video {video_path}: {e}")
            video_tensor = torch.zeros(3, self.num_frames, self.height, self.width)
        
        # Process video: aspect-ratio preserving resize + crop to target dimensions
        video_tensor = process_video(video_tensor, self.num_frames, self.height, self.width, use_random_crop=self.use_random_crop)
        
        # Ensure video tensor has exactly the expected shape
        C, F, H, W = video_tensor.shape
        if F != self.num_frames or H != self.height or W != self.width:
            # If shape doesn't match, create a properly sized tensor
            video_tensor = torch.nn.functional.interpolate(
                video_tensor.reshape(C * F, 1, H, W),
                size=(self.height, self.width),
                mode='bilinear',
                align_corners=False
            ).reshape(C, F, self.height, self.width)
            # Ensure exactly num_frames
            if F < self.num_frames:
                # Pad by repeating last frame
                num_pad = self.num_frames - F
                last_frame = video_tensor[:, -1:, :, :]
                padding = last_frame.repeat(1, num_pad, 1, 1)
                video_tensor = torch.cat([video_tensor, padding], dim=1)
            elif F > self.num_frames:
                # Crop to num_frames
                video_tensor = video_tensor[:, :self.num_frames, :, :]
        
        # Clone to ensure storage is resizable (required for DataLoader collate)
        video_tensor = video_tensor.contiguous().clone()
        
        # Process prompt
        prompt = row['caption']
        if self.prompt_prefix is not None:
            prompt = self.prompt_prefix + prompt
        
        prompt_input_ids = tokenize_prompt(self.tokenizer, prompt, self.text_encoder_architecture)[0]
        # Clone to ensure storage is resizable
        prompt_input_ids = prompt_input_ids.clone()
        
        return {
            "video": video_tensor,  # [C, num_frames, height, width], guaranteed shape
            "prompt_input_ids": prompt_input_ids
        }
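
# Usage sketch (illustrative; paths and the tokenizer are placeholders):
#
#   ds = OpenVid1MDataset(
#       csv_path="OpenVid-1M.csv",
#       video_root_dir="/data/openvid/videos",
#       tokenizer=umt5_tokenizer,
#       num_frames=16, height=480, width=848,
#   )
#   item = ds[0]
#   # item["video"]: [3, 16, 480, 848] in [0, 1]; item["prompt_input_ids"]: [512]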


class TinyOpenVid1MDataset(OpenVid1MDataset):
    """
    A tiny subset of OpenVid1MDataset for overfitting experiments.
    Selects a fixed, seed-shuffled subset of at most max_samples entries from the full dataset.
    """
    def __init__(
        self,
        csv_path,
        video_root_dir=None,
        tokenizer=None,
        num_frames=16,
        height=480,
        width=848,
        text_encoder_architecture='umt5-base',
        prompt_prefix=None,
        max_samples=256,  # Maximum number of samples to keep
        seed=42,  # Fixed seed for reproducibility
    ):
        """
        Args:
            max_samples: Maximum number of samples to use (default: 256)
            seed: Random seed for reproducibility (default: 42)
        """
        # Initialize parent class
        super().__init__(
            csv_path=csv_path,
            video_root_dir=video_root_dir,
            tokenizer=tokenizer,
            num_frames=num_frames,
            height=height,
            width=width,
            text_encoder_architecture=text_encoder_architecture,
            prompt_prefix=prompt_prefix,
        )
        
        # Limit to first max_samples
        original_len = len(self.data)
        if original_len > max_samples:
            # Shuffle with a fixed seed for reproducibility, then keep the first max_samples
            random.seed(seed)
            indices = list(range(original_len))
            random.shuffle(indices)
            self.data = [self.data[i] for i in indices[:max_samples]]
            logger.info(f"Limited dataset to {max_samples} samples (from {original_len} total) for overfitting experiment")
        else:
            logger.info(f"Using all {len(self.data)} samples (less than max_samples={max_samples})")


def get_hierarchical_path(base_dir, index):
    """
    Get hierarchical path for loading features from 3-level directory structure.
    
    Structure: base_dir/level1/level2/level3/filename.npy
    - level1: index // 1000000 (0-999)
    - level2: (index // 1000) % 1000 (0-999)
    - level3: index % 1000 (0-999)
    
    Args:
        base_dir: Base directory for features
        index: Sample index
        
    Returns:
        Full path to the file
    """
    level1 = index // 1000000
    level2 = (index // 1000) % 1000
    level3 = index % 1000
    
    file_path = os.path.join(
        base_dir,
        f"{level1:03d}",
        f"{level2:03d}",
        f"{level3:03d}",
        f"{index:08d}.npy"
    )
    
    return file_path
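
# Worked example: get_hierarchical_path("feats/video_codes", 1234567)
# -> "feats/video_codes/001/234/567/01234567.npy"
# (level1 = 1234567 // 1000000 = 1, level2 = 1234 % 1000 = 234, level3 = 567)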


class PrecomputedFeatureDataset(Dataset):
    """
    Dataset for loading pre-extracted video codes and text embeddings.
    
    This dataset loads features that were pre-extracted by extract_features.py,
    avoiding the need to encode videos and text during training.
    
    Features are stored in a 3-level hierarchical directory structure:
    - video_codes/level1/level2/level3/index.npy
    - text_embeddings/level1/level2/level3/index.npy
    """
    
    def __init__(
        self,
        features_dir,
        num_samples=None,
        start_index=0,
    ):
        """
        Args:
            features_dir: Directory containing extracted features (should have video_codes/ and text_embeddings/ subdirs)
            num_samples: Number of samples to use. If None, use all available samples.
            start_index: Starting index for samples (for resuming or subset selection)
        """
        self.features_dir = features_dir
        self.video_codes_dir = os.path.join(features_dir, "video_codes")
        self.text_embeddings_dir = os.path.join(features_dir, "text_embeddings")
        self.metadata_file = os.path.join(features_dir, "metadata.json")
        
        # Load metadata
        if os.path.exists(self.metadata_file):
            import json
            with open(self.metadata_file, 'r') as f:
                self.metadata = json.load(f)
            logger.info(f"Loaded metadata from {self.metadata_file}")
            logger.info(f"  Total samples in metadata: {self.metadata.get('num_samples', 'unknown')}")
            
            # Get available indices from metadata
            if 'samples' in self.metadata and len(self.metadata['samples']) > 0:
                available_indices = sorted([s['index'] for s in self.metadata['samples']])
            else:
                # Fallback: infer from directory structure
                available_indices = self._scan_hierarchical_directory(self.video_codes_dir)
        else:
            # If no metadata, scan directory structure
            logger.warning(f"Metadata file not found: {self.metadata_file}, scanning directory structure")
            self.metadata = {}
            available_indices = self._scan_hierarchical_directory(self.video_codes_dir)
        
        # Filter by start_index and num_samples
        available_indices = [idx for idx in available_indices if idx >= start_index]
        if num_samples is not None:
            available_indices = available_indices[:num_samples]
        
        self.indices = available_indices
        logger.info(f"PrecomputedFeatureDataset: {len(self.indices)} samples available")
        if len(self.indices) > 0:
            logger.info(f"  Index range: {min(self.indices)} to {max(self.indices)}")
    
    def _scan_hierarchical_directory(self, base_dir):
        """
        Scan hierarchical directory structure to find all available indices.
        
        Args:
            base_dir: Base directory to scan
            
        Returns:
            List of available indices
        """
        available_indices = []
        
        if not os.path.exists(base_dir):
            raise FileNotFoundError(f"Directory not found: {base_dir}")
        
        # Scan level1 directories (000-999)
        for level1 in range(1000):
            level1_dir = os.path.join(base_dir, f"{level1:03d}")
            if not os.path.exists(level1_dir):
                continue
            
            # Scan level2 directories (000-999)
            for level2 in range(1000):
                level2_dir = os.path.join(level1_dir, f"{level2:03d}")
                if not os.path.exists(level2_dir):
                    continue
                
                # Scan level3 directories (000-999)
                for level3 in range(1000):
                    level3_dir = os.path.join(level2_dir, f"{level3:03d}")
                    if not os.path.exists(level3_dir):
                        continue
                    
                    # List all .npy files in level3 directory
                    for filename in os.listdir(level3_dir):
                        if filename.endswith('.npy'):
                            try:
                                index = int(filename.replace('.npy', ''))
                                available_indices.append(index)
                            except ValueError:
                                continue
        
        return sorted(available_indices)
    
    def __len__(self):
        return len(self.indices)
    
    def __getitem__(self, idx):
        sample_idx = self.indices[idx]
        
        # Get hierarchical paths
        video_code_path = get_hierarchical_path(self.video_codes_dir, sample_idx)
        text_embedding_path = get_hierarchical_path(self.text_embeddings_dir, sample_idx)
        
        # Load video codes
        # Note: We load directly (not mmap) to avoid storage sharing issues with torch
        # The files are small enough (video codes are int32, typically < 1MB per sample)
        if not os.path.exists(video_code_path):
            raise FileNotFoundError(f"Video code not found: {video_code_path}")
        video_codes_np = np.load(video_code_path)  # [F', H', W']
        # Use torch.tensor() instead of torch.from_numpy() to ensure a complete copy
        # This avoids "Trying to resize storage that is not resizable" errors in DataLoader collate
        video_codes = torch.tensor(video_codes_np, dtype=torch.int32)  # CPU tensor, int32, fully copied
        del video_codes_np  # Release numpy array reference
        
        # Load text embedding
        # Note: We load directly (not mmap) to avoid storage sharing issues with torch
        if not os.path.exists(text_embedding_path):
            raise FileNotFoundError(f"Text embedding not found: {text_embedding_path}")
        text_embedding_np = np.load(text_embedding_path)  # [L, D]
        # Use torch.tensor() instead of torch.from_numpy() to ensure a complete copy
        # Preserve original dtype (should be float16 from extraction)
        text_embedding_dtype = torch.float16 if text_embedding_np.dtype == np.float16 else torch.float32
        text_embedding = torch.tensor(text_embedding_np, dtype=text_embedding_dtype)  # CPU tensor, fully copied
        del text_embedding_np  # Release numpy array reference
        
        return {
            "video_codes": video_codes,  # [F', H', W'], CPU tensor, int32
            "text_embedding": text_embedding,  # [L, D], CPU tensor, float16/bfloat16
            "sample_index": sample_idx,
        }
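
# Usage sketch (illustrative; the directory layout is the one described in the
# class docstring above, as produced by extract_features.py):
#
#   from torch.utils.data import DataLoader
#   ds = PrecomputedFeatureDataset("features/", num_samples=10000)
#   loader = DataLoader(ds, batch_size=8, num_workers=4, shuffle=True)
#   batch = next(iter(loader))
#   # batch["video_codes"]: [8, F', H', W'] int32
#   # batch["text_embedding"]: [8, L, D] float16/float32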