dorsar committed on
Commit
07e6695
·
verified ·
1 Parent(s): c4038fa

Delete diffusion_model.py

Browse files
Files changed (1) hide show
  1. diffusion_model.py +0 -406
diffusion_model.py DELETED
@@ -1,406 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- from torch.utils.data import Dataset
4
- import zarr
5
- import numpy as np
6
- from diffusers import UNet1DModel, DDPMScheduler
7
- from diffusers.optimization import get_scheduler
8
- from diffusers.training_utils import EMAModel
9
- import torch.nn.functional as F
10
- from tqdm.auto import tqdm
11
- from dataclasses import dataclass
12
- from typing import Tuple, Optional
13
- import os
14
- import gdown
15
- import collections
16
- from imageio import get_writer
17
-
18
# Data Configurations
@dataclass
class DataConfig:
    """Configuration for the Push-T dataset and sampling horizons.

    Horizon semantics (diffusion-policy convention): the model conditions on
    the last `obs_horizon` observations, predicts `pred_horizon` actions, and
    the caller would execute only the first `action_horizon` of them.
    """
    # Dataset paths and download info
    # Local path the zarr archive is read from (and downloaded to if missing).
    dataset_path: str = "pusht_cchi_v7_replay.zarr.zip"
    # Google Drive file id passed to gdown; the trailing "&confirm=t" is
    # embedded in the id string — NOTE(review): looks intentional (bypasses
    # the large-file confirm page) but verify gdown still accepts it.
    dataset_gdrive_id: str = "1KY1InLurpMvJDRb14L9NlXT_fEsCvVUq&confirm=t"

    # Sequence parameters
    pred_horizon: int = 16  # Number of steps to predict
    obs_horizon: int = 2    # Number of observations to condition on
    action_horizon: int = 8 # Number of actions to execute

    # Data dimensions
    # image_size/image_channels are declared but unused by the state-only
    # dataset below — presumably kept for a vision variant.
    image_size: Tuple[int, int] = (96, 96)
    image_channels: int = 3
    action_dim: int = 2  # (x, y) pusher target
    state_dim: int = 5   # [agent_x, agent_y, block_x, block_y, block_angle]
37
@dataclass
class ModelConfig:
    """Configuration for neural networks (observation encoder + UNet1D)."""
    # Observation encoding
    # Output width of ObservationEncoder per timestep.
    obs_embed_dim: int = 256

    # UNet configuration
    sample_size: int = 16   # pred_horizon length (temporal axis of the 1-D UNet)
    in_channels: int = 2    # action dimension
    out_channels: int = 2   # action dimension
    layers_per_block: int = 2
    block_out_channels: Tuple[int, ...] = (128,)
    norm_num_groups: int = 8
    # Single down/up stage — the tuples are length 1 by construction.
    down_block_types: Tuple[str, ...] = ("DownBlock1D",) * 1
    up_block_types: Tuple[str, ...] = ("UpBlock1D",) * 1

    def __post_init__(self):
        # For conditioning through input channels: the UNet input is the
        # noisy action channels concatenated with a projected observation
        # embedding. The //8 factor must stay in sync with the
        # obs_projection layer built in train_diffusion (out dim
        # obs_embed_dim // 8) — they are coupled only by this convention.
        self.total_in_channels = self.in_channels + self.obs_embed_dim //8 # actions + conditioning
58
- """
59
- Helper Functions
60
- """
61
def create_sample_indices(
        episode_ends:np.ndarray, sequence_length:int,
        pad_before: int=0, pad_after: int=0):
    """Enumerate every window of length `sequence_length` in each episode.

    Episodes are stored concatenated; `episode_ends[i]` is one-past the last
    index of episode i. Windows may start up to `pad_before` steps before an
    episode and end up to `pad_after` steps after it; such windows are later
    edge-padded by `sample_sequence`.

    Returns an (N, 4) int array of rows
    [buffer_start_idx, buffer_end_idx, sample_start_idx, sample_end_idx]:
    the clipped slice into the concatenated buffer, and where that slice
    lands inside the fixed-length output window.
    """
    rows = []
    prev_end = 0
    for ep_end in episode_ends:
        ep_len = ep_end - prev_end
        first = -pad_before
        last = ep_len - sequence_length + pad_after  # inclusive
        for offset in range(first, last + 1):
            buf_lo = prev_end + max(offset, 0)
            buf_hi = prev_end + min(offset + sequence_length, ep_len)
            # How much of the window falls outside the episode on each side.
            lead = buf_lo - (prev_end + offset)
            trail = (prev_end + offset + sequence_length) - buf_hi
            rows.append([buf_lo, buf_hi, lead, sequence_length - trail])
        prev_end = ep_end
    return np.array(rows)
88
-
89
def sample_sequence(train_data, sequence_length,
                    buffer_start_idx, buffer_end_idx,
                    sample_start_idx, sample_end_idx):
    """Cut one fixed-length window out of every array in `train_data`.

    The slice [buffer_start_idx:buffer_end_idx] is placed at
    [sample_start_idx:sample_end_idx] inside a `sequence_length`-long window;
    any uncovered leading/trailing region is filled by repeating the first/
    last frame of the slice (edge padding). Returns {key: (sequence_length,
    ...) array}.
    """
    result = dict()
    needs_padding = (sample_start_idx > 0) or (sample_end_idx < sequence_length)
    for key, source in train_data.items():
        window = source[buffer_start_idx:buffer_end_idx]
        if not needs_padding:
            result[key] = window
            continue
        padded = np.zeros(
            shape=(sequence_length,) + source.shape[1:],
            dtype=source.dtype)
        if sample_start_idx > 0:
            padded[:sample_start_idx] = window[0]   # repeat first frame
        if sample_end_idx < sequence_length:
            padded[sample_end_idx:] = window[-1]    # repeat last frame
        padded[sample_start_idx:sample_end_idx] = window
        result[key] = padded
    return result
107
-
108
def get_data_stats(data):
    """Return per-feature 'min'/'max' over all leading dimensions.

    The last axis is treated as the feature dimension; everything before it
    is flattened together.
    """
    flat = data.reshape(-1, data.shape[-1])
    return {
        'min': np.min(flat, axis=0),
        'max': np.max(flat, axis=0),
    }
115
-
116
def normalize_data(data, stats):
    """Map `data` into [-1, 1] using the per-feature min/max in `stats`."""
    # First rescale to the unit interval [0, 1] ...
    unit = (data - stats['min']) / (stats['max'] - stats['min'])
    # ... then shift/scale to [-1, 1].
    return unit * 2 - 1
122
-
123
def unnormalize_data(ndata, stats):
    """Inverse of `normalize_data`: map [-1, 1] values back to data units."""
    unit = (ndata + 1) / 2          # [-1, 1] -> [0, 1]
    span = stats['max'] - stats['min']
    return unit * span + stats['min']
127
-
128
# Dataset Class
class PushTStateDataset(torch.utils.data.Dataset):
    """State-based Push-T dataset backed by a zarr archive.

    Yields dicts with:
      'obs'    — (obs_horizon, state_dim) normalized observations
      'action' — (pred_horizon, action_dim) normalized actions
    Both are normalized to [-1, 1] with per-feature min/max stats, which are
    kept on `self.stats` for unnormalizing model outputs later.
    """

    def __init__(self, dataset_path, pred_horizon, obs_horizon, action_horizon):
        # All demonstration episodes are concatenated along the first axis N.
        root = zarr.open(dataset_path, 'r')
        raw = {
            'action': root['data']['action'][:],  # (N, action_dim)
            'obs': root['data']['state'][:],      # (N, obs_dim)
        }
        # One-past-the-end index of each episode.
        episode_ends = root['meta']['episode_ends'][:]

        # Precompute every (padded) window so __getitem__ is a cheap lookup.
        # Padding ensures every timestep appears in at least one sample.
        window_index = create_sample_indices(
            episode_ends=episode_ends,
            sequence_length=pred_horizon,
            pad_before=obs_horizon - 1,
            pad_after=action_horizon - 1)

        # Per-key min/max stats, then normalize everything to [-1, 1].
        stats = dict()
        normalized = dict()
        for key, arr in raw.items():
            stats[key] = get_data_stats(arr)
            normalized[key] = normalize_data(arr, stats[key])

        self.indices = window_index
        self.stats = stats
        self.normalized_train_data = normalized
        self.pred_horizon = pred_horizon
        self.action_horizon = action_horizon
        self.obs_horizon = obs_horizon

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        # Precomputed slice bounds for this sample.
        buf_lo, buf_hi, pad_lo, pad_hi = self.indices[idx]

        # Edge-padded, fixed-length window of normalized data.
        nsample = sample_sequence(
            train_data=self.normalized_train_data,
            sequence_length=self.pred_horizon,
            buffer_start_idx=buf_lo,
            buffer_end_idx=buf_hi,
            sample_start_idx=pad_lo,
            sample_end_idx=pad_hi,
        )

        # Keep only the observations the policy conditions on.
        nsample['obs'] = nsample['obs'][:self.obs_horizon, :]
        return nsample
187
-
188
- # Model Classes
189
class ObservationEncoder(nn.Module):
    """MLP that embeds each observation timestep for conditioning.

    forward maps (batch, timesteps, obs_dim) -> (batch, timesteps * embed_dim)
    by encoding each timestep independently and flattening the results.
    """

    def __init__(self, obs_dim: int, embed_dim: int):
        super().__init__()
        # Keep the exact Sequential layout: checkpoints index it as net.0/net.2.
        self.net = nn.Sequential(
            nn.Linear(obs_dim, embed_dim * 2),
            nn.Mish(),
            nn.Linear(embed_dim * 2, embed_dim)
        )

    def forward(self, x):
        # x: [batch, timesteps, obs_dim]
        batch, steps, dim = x.shape
        # Fold time into the batch so the MLP sees one timestep per row.
        encoded = self.net(x.reshape(batch * steps, dim))
        # Unfold: concatenate the per-timestep embeddings.
        return encoded.reshape(batch, steps * self.net[-1].out_features)
206
-
207
def train_diffusion():
    """Train diffusion model using HuggingFace diffusers.

    Trains a 1-D UNet to predict the noise added to normalized action
    sequences, conditioned on encoded observations concatenated as extra
    input channels. Checkpoints every 10 epochs to ./checkpoints.

    Returns:
        (model, obs_encoder, obs_projection, ema, noise_scheduler,
         optimizer, stats) — the raw (non-EMA) trained modules plus the
        EMA tracker, scheduler, optimizer, and normalization stats.
    """
    # Configs
    data_config = DataConfig()
    model_config = ModelConfig()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_epochs = 100
    print(f"Using device: {device}")

    # Create dataset (from zarr file)
    dataset = PushTStateDataset(
        dataset_path=data_config.dataset_path,
        pred_horizon=data_config.pred_horizon,
        obs_horizon=data_config.obs_horizon,
        action_horizon=data_config.action_horizon
    )

    # Keep normalization stats for checkpointing; define save directory.
    stats = dataset.stats
    save_dir = "checkpoints"
    os.makedirs(save_dir, exist_ok=True)

    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=256,
        num_workers=4,
        shuffle=True,
        pin_memory=True,
        persistent_workers=True
    )

    # Create observation encoder
    obs_encoder = ObservationEncoder(
        obs_dim=data_config.state_dim,
        embed_dim=model_config.obs_embed_dim
    ).to(device)

    # Create UNet1D model from diffusers. Input channels =
    # action channels + projected conditioning channels (total_in_channels).
    model = UNet1DModel(
        sample_size=model_config.sample_size,
        in_channels=model_config.total_in_channels,  # actions + conditioning
        out_channels=model_config.out_channels,
        layers_per_block=model_config.layers_per_block,
        block_out_channels=model_config.block_out_channels,
        norm_num_groups=model_config.norm_num_groups,
        down_block_types=model_config.down_block_types,
        up_block_types=model_config.up_block_types,
    ).to(device)

    # Create noise scheduler (epsilon-prediction DDPM, cosine betas).
    noise_scheduler = DDPMScheduler(
        num_train_timesteps=100,
        beta_schedule="squaredcos_cap_v2",
        clip_sample=True,
        prediction_type="epsilon"
    )

    # Projection layer created OUTSIDE the training loop so its parameters
    # persist and are trained/EMA-tracked. Its output width
    # (obs_embed_dim // 8) must match ModelConfig.total_in_channels'
    # conditioning term — they are coupled by convention only.
    obs_projection = nn.Linear(model_config.obs_embed_dim * data_config.obs_horizon,
                               model_config.obs_embed_dim // 8).to(device)

    # One optimizer over all three trainable modules.
    optimizer = torch.optim.AdamW([
        {'params': model.parameters()},
        {'params': obs_encoder.parameters()},
        {'params': obs_projection.parameters()}
    ], lr=1e-4)

    # EMA tracks the same flat parameter list; ema.step below must pass the
    # parameters in this exact order.
    ema = EMAModel(
        parameters=list(model.parameters()) +
                   list(obs_encoder.parameters()) +
                   list(obs_projection.parameters()),
        power=0.75
    )

    for epoch in range(num_epochs):
        progress_bar = tqdm(total=len(dataloader), desc=f'Epoch {epoch}')
        epoch_loss = []

        for batch in dataloader:
            # Get batch data (already normalized to [-1, 1] by the dataset).
            obs = batch['obs'].to(device)        # [batch, obs_horizon, obs_dim]
            actions = batch['action'].to(device) # [batch, pred_horizon, action_dim]
            batch_size = obs.shape[0]

            # Encode observations for conditioning
            obs_embedding = obs_encoder(obs)  # [batch, obs_embed_dim * obs_horizon]

            # Sample per-example noise and diffusion timesteps.
            noise = torch.randn_like(actions)
            timesteps = torch.randint(
                0, noise_scheduler.config.num_train_timesteps,
                (batch_size,), device=device
            ).long()

            # Forward-diffuse: add noise to actions per the schedule.
            noisy_actions = noise_scheduler.add_noise(actions, noise, timesteps)

            # Reshape to channels-first for the 1-D UNet:
            # [batch, pred_horizon, channels] -> [batch, channels, pred_horizon]
            noisy_actions = noisy_actions.transpose(1, 2)
            noise = noise.transpose(1, 2)

            # Project the observation embedding down to the conditioning width.
            obs_cond = obs_projection(obs_embedding)  # [batch, obs_embed_dim//8]

            # Broadcast the conditioning across the temporal axis.
            obs_cond = obs_cond.unsqueeze(-1).expand(-1, -1, noisy_actions.shape[-1])

            # Concatenate along channel dimension: conditioning-by-inpainting.
            model_input = torch.cat([noisy_actions, obs_cond], dim=1)

            # UNet out_channels == action_dim, so no output slicing is needed.
            noise_pred = model(
                model_input,
                timesteps,
            ).sample

            # Epsilon-prediction objective: predict the injected noise.
            loss = F.mse_loss(noise_pred, noise)
            epoch_loss.append(loss.item())

            # Optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Update EMA parameters (same order as EMAModel construction).
            ema.step(list(model.parameters()) +
                     list(obs_encoder.parameters()) +
                     list(obs_projection.parameters()))

            # Update progress
            progress_bar.update(1)
            progress_bar.set_postfix(loss=loss.item())

        progress_bar.close()

        # Print epoch stats
        avg_loss = sum(epoch_loss) / len(epoch_loss)
        print(f"\nEpoch {epoch} average loss: {avg_loss:.6f}")

        # Save checkpoint every 10 epochs
        if (epoch + 1) % 10 == 0:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'encoder_state_dict': obs_encoder.state_dict(),
                'projection_state_dict': obs_projection.state_dict(),
                'ema_state_dict': ema.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                # DDPMScheduler carries no learned state; it is rebuilt from
                # config at load time.
                'stats': stats,
                'loss': avg_loss,
            }, os.path.join(save_dir, f'diffusion_checkpoint_{epoch}.pt'))

    return model, obs_encoder, obs_projection, ema, noise_scheduler, optimizer, stats
364
-
365
def main():
    """Fetch the Push-T dataset if missing, then sanity-check one batch."""
    cfg = DataConfig()

    # Download the zarr archive from Google Drive on first run.
    if not os.path.isfile(cfg.dataset_path):
        print("Downloading dataset...")
        gdown.download(id=cfg.dataset_gdrive_id, output=cfg.dataset_path, quiet=False)

    dataset = PushTStateDataset(
        dataset_path=cfg.dataset_path,
        pred_horizon=cfg.pred_horizon,
        obs_horizon=cfg.obs_horizon,
        action_horizon=cfg.action_horizon,
    )

    # Pull a single batch and report its shapes as a smoke test.
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=256, num_workers=1,
        shuffle=True, pin_memory=True, persistent_workers=True,
    )
    batch = next(iter(loader))
    print("batch['obs'].shape:", batch['obs'].shape)
    print("batch['action'].shape:", batch['action'].shape)
388
-
389
- if __name__ == "__main__":
390
- main()
391
-
392
- print("\nStarting diffusion model training...")
393
- model, obs_encoder, obs_projection, ema, noise_scheduler, optimizer, stats = train_diffusion()
394
-
395
- # Save final model
396
- save_dir = "checkpoints"
397
- os.makedirs(save_dir, exist_ok=True)
398
- torch.save({
399
- 'model_state_dict': model.state_dict(),
400
- 'encoder_state_dict': obs_encoder.state_dict(),
401
- 'projection_state_dict': obs_projection.state_dict(),
402
- 'ema_state_dict': ema.state_dict(),
403
- 'optimizer_state_dict': optimizer.state_dict(),
404
- # 'noise_scheduler_state_dict': noise_scheduler.state_dict(),
405
- 'stats': stats
406
- }, os.path.join(save_dir, 'diffusion_final.pt'))