SolarSys2025 committed
Commit 69a0a02 · verified · 1 Parent(s): 1082dd4

Delete Data_generation_tool_kit

Data_generation_tool_kit/.DS_Store DELETED
Binary file (6.15 kB)
 
Data_generation_tool_kit/dataloader.py DELETED
@@ -1,179 +0,0 @@
-  import torch
-  import pandas as pd
-  import numpy as np
-  from sklearn.preprocessing import MinMaxScaler
-  import joblib
-  import os
-  from typing import Tuple, Dict
-  import warnings
-  warnings.filterwarnings('ignore')
-
-  class MultiHouseDataset(torch.utils.data.Dataset):
-
-      def __init__(self, data_dir: str, window_size: int = 96, step_size: int = 1,
-                   scaler_path: str = 'global_scaler.gz', cache_in_memory: bool = True,
-                   dtype: torch.dtype = torch.float32, limit_to_one_year: bool = True):
-          self.window_size = window_size
-          self.step_size = step_size
-          self.cache_in_memory = cache_in_memory
-          self.dtype = dtype
-          self.limit_to_one_year = limit_to_one_year
-
-          all_files = sorted([f for f in os.listdir(data_dir) if f.endswith('.csv')])
-          print(f"Found {len(all_files)} house files in '{data_dir}'.")
-
-          self.num_houses = len(all_files)
-
-          print("Reading house data...")
-          if self.limit_to_one_year:
-              print("INFO: Limiting data to the first year (17,520 samples) for each house.")
-
-          data_per_house = []
-          timestamps_per_house = []
-
-          SAMPLES_PER_YEAR = 17520
-
-          for filename in all_files:
-              df = pd.read_csv(os.path.join(data_dir, filename), parse_dates=['timestamp'])
-              timestamps_per_house.append(df['timestamp'].values)
-              time_series_values = df[['grid_usage', 'solar_generation']].values.astype(np.float32)
-
-              if self.limit_to_one_year:
-                  time_series_values = time_series_values[:SAMPLES_PER_YEAR]
-
-              num_timesteps = len(time_series_values)
-              timesteps_of_day = np.arange(num_timesteps) % 48
-
-              sin_time = np.sin(2 * np.pi * timesteps_of_day / 48.0).astype(np.float32)
-              cos_time = np.cos(2 * np.pi * timesteps_of_day / 48.0).astype(np.float32)
-
-              time_series_values = np.concatenate([
-                  time_series_values,
-                  sin_time[:, np.newaxis],
-                  cos_time[:, np.newaxis]
-              ], axis=1)
-
-              data_per_house.append(time_series_values)
-
-          if os.path.exists(scaler_path):
-              scaler = joblib.load(scaler_path)
-              print(f"Scaler loaded from {scaler_path}")
-          else:
-              print("Fitting global scaler...")
-              combined_data = np.vstack(data_per_house)
-              scaler = MinMaxScaler(feature_range=(-1, 1))
-              scaler.fit(combined_data)
-              joblib.dump(scaler, scaler_path)
-              print(f"Scaler saved to {scaler_path}")
-
-          if self.cache_in_memory:
-              print("Caching normalized data...")
-              self.normalized_data_per_house = []
-              for series in data_per_house:
-                  normalized = scaler.transform(series)
-                  tensor_data = torch.from_numpy(normalized).to(dtype=self.dtype)
-                  self.normalized_data_per_house.append(tensor_data)
-          else:
-              self.normalized_data_per_house = []
-              for series in data_per_house:
-                  self.normalized_data_per_house.append(scaler.transform(series))
-
-          del data_per_house
-
-          print("Pre-computing mappings...")
-
-          self.windows_per_house = [(len(d) - self.window_size) // self.step_size + 1 for d in self.normalized_data_per_house]
-          self.cumulative_windows = np.cumsum([0] + self.windows_per_house)
-          self.total_windows = self.cumulative_windows[-1]
-
-          self.sample_to_house = np.empty(self.total_windows, dtype=np.int32)
-          self.sample_to_local_idx = np.empty(self.total_windows, dtype=np.int32)
-          self.sample_to_day_of_week = np.empty(self.total_windows, dtype=np.int32)
-          self.sample_to_day_of_year = np.empty(self.total_windows, dtype=np.int32)
-
-          for house_idx in range(self.num_houses):
-              start_global_idx = self.cumulative_windows[house_idx]
-              end_global_idx = self.cumulative_windows[house_idx + 1]
-              num_windows_for_this_house = self.windows_per_house[house_idx]
-
-              self.sample_to_house[start_global_idx:end_global_idx] = house_idx
-
-              local_indices = np.arange(num_windows_for_this_house) * self.step_size
-              self.sample_to_local_idx[start_global_idx:end_global_idx] = local_indices
-
-              house_timestamps = pd.Series(timestamps_per_house[house_idx][local_indices])
-              self.sample_to_day_of_week[start_global_idx:end_global_idx] = house_timestamps.dt.dayofweek
-              self.sample_to_day_of_year[start_global_idx:end_global_idx] = house_timestamps.dt.dayofyear - 1
-
-          print(f"Dataset initialized. Total windows: {self.total_windows} from {self.num_houses} houses.")
-          memory_usage = sum(data.numel() * data.element_size() for data in self.normalized_data_per_house) / 1e6 if self.cache_in_memory else 0
-          print(f"Memory usage for cached tensors: {memory_usage:.1f} MB")
-
-      def __len__(self) -> int:
-          return self.total_windows
-
-      def __getitem__(self, idx: int) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
-          if idx < 0 or idx >= self.total_windows:
-              raise IndexError("Index out of range")
-
-          house_index = self.sample_to_house[idx]
-          local_start_pos = self.sample_to_local_idx[idx]
-
-          window_data = self.normalized_data_per_house[house_index][local_start_pos : local_start_pos + self.window_size]
-
-          conditions = {
-              "house_id": torch.tensor(house_index, dtype=torch.long),
-              "day_of_week": torch.tensor(self.sample_to_day_of_week[idx], dtype=torch.long),
-              "day_of_year": torch.tensor(self.sample_to_day_of_year[idx], dtype=torch.long),
-          }
-
-          return window_data, conditions
-
-      def get_memory_usage(self) -> dict:
-          if self.cache_in_memory:
-              tensor_memory = sum(data.numel() * data.element_size() for data in self.normalized_data_per_house) / 1e6
-          else:
-              tensor_memory = 0
-
-          mapping_memory = (self.sample_to_house.nbytes + self.sample_to_local_idx.nbytes) / 1e6
-
-          return {
-              'tensor_cache_mb': tensor_memory,
-              'mapping_arrays_mb': mapping_memory,
-              'total_mb': tensor_memory + mapping_memory
-          }
-
-  class LatentDataset(torch.utils.data.Dataset):
-      def __init__(self, latent_vectors: torch.Tensor, house_ids: torch.Tensor):
-          assert len(latent_vectors) == len(house_ids), "Latent vectors and house IDs must have same length"
-          self.latent_vectors = latent_vectors.contiguous()
-          self.house_ids = house_ids.contiguous()
-
-      def __len__(self) -> int:
-          return len(self.latent_vectors)
-
-      def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
-          return self.latent_vectors[idx], self.house_ids[idx]
-
-  if __name__ == "__main__":
-      import time
-
-      DATA_DIRECTORY = './data/per_house/'
-
-      if os.path.exists(DATA_DIRECTORY):
-          print("--- Testing Dataset Setup ---")
-
-          start_time = time.time()
-          dataset = MultiHouseDataset(data_dir=DATA_DIRECTORY, window_size=96, step_size=96)
-          init_time = time.time() - start_time
-          print(f"Dataset initialization: {init_time:.2f}s")
-          print(f"Memory usage: {dataset.get_memory_usage()}")
-
-          if len(dataset) > 0:
-              first_sample, first_conditions = dataset[0]
-
-              print(f"\nSample data shape: {first_sample.shape}")
-              print(f"Sample conditions: {first_conditions}")
-              print(f"Total houses: {dataset.num_houses}")
-      else:
-          print(f"ERROR: Data directory not found at '{DATA_DIRECTORY}'. Please create and populate this directory.")
 
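For reference, here is a minimal usage sketch of the deleted dataset class (not part of the original repo; the data directory path is a placeholder and the module is assumed to be importable as dataloader.py). Each item is a (window, conditions) pair, so the class drops directly into a standard torch DataLoader:

  # Hypothetical usage sketch for the deleted MultiHouseDataset; paths are placeholders.
  import torch
  from torch.utils.data import DataLoader
  from dataloader import MultiHouseDataset

  dataset = MultiHouseDataset(data_dir='./data/per_house/', window_size=96, step_size=96)
  loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=2)

  # Each batch: windows of shape (batch, window_size, 4) -- grid_usage, solar_generation,
  # sin_time, cos_time -- plus a dict of integer conditioning tensors.
  windows, conditions = next(iter(loader))
  print(windows.shape)                 # e.g. torch.Size([32, 96, 4])
  print(conditions["house_id"].shape)  # torch.Size([32])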
Data_generation_tool_kit/generate.py DELETED
@@ -1,291 +0,0 @@
-  import torch
-  import torch.nn as nn
-  import numpy as np
-  import pandas as pd
-  import os
-  import joblib
-  import math
-  import datetime
-  from tqdm import tqdm
-  import matplotlib.pyplot as plt
-  import matplotlib.dates as mdates
-
-  # =============================================================================
-  # 1. MODEL CLASS DEFINITIONS
-  # =============================================================================
-
-  try:
-      from hierarchical_diffusion_model import (
-          HierarchicalDiffusionModel, ConditionalUnet, ResnetBlock1D,
-          AttentionBlock1D, DownBlock1D, UpBlock1D,
-          SinusoidalPositionEmbeddings, ImprovedDiffusionModel
-      )
-      print("Diffusion model classes imported.")
-  except ImportError:
-      print("="*50)
-      print("ERROR: Could not import model classes from 'hierarchical_diffusion_model.py'.")
-      print("="*50)
-      exit()
-
-
-  # =============================================================================
-  # 2. HELPER FUNCTIONS
-  # =============================================================================
-
-  def add_amplitude_jitter(series, daily_samples=48, scale=0.05):
-      series = series.copy()
-      num_days = len(series) // daily_samples
-      if num_days == 0: return series
-      factors = np.random.normal(1.0, scale, size=num_days)
-      for d in range(num_days):
-          start, end = d * daily_samples, (d + 1) * daily_samples
-          series[start:end] *= factors[d]
-      return series
-
-  def add_cloud_variability(pv, timestamps, base_sigma=0.25):
-      pv = pv.copy()
-      if len(pv) == 0: return pv
-      days = pd.Series(pv, index=timestamps).groupby(timestamps.date)
-      adjusted = []
-      for day, vals in days:
-          cloud_factor = np.random.lognormal(mean=-0.02, sigma=base_sigma)
-          hour = vals.index.hour
-          day_pv = np.where((hour >= 6) & (hour <= 18), vals * cloud_factor, 0.0)
-          adjusted.append(day_pv)
-      if not adjusted: return np.array([])
-      return np.concatenate(adjusted)
-
-  def enforce_physics(df: pd.DataFrame, pv_cap_kw: float | None = None) -> pd.DataFrame:
-      df = df.copy()
-      df['solar_generation'] = np.clip(df['solar_generation'], 0.0, None)
-      hour = df.index.hour
-      night = (hour < 7) | (hour > 18)
-      df.loc[night, 'solar_generation'] = 0.0
-      export_mask = df['grid_usage'] < 0
-      if export_mask.any():
-          limited_export = -np.minimum(-df.loc[export_mask, 'grid_usage'], df.loc[export_mask, 'solar_generation'])
-          df.loc[export_mask, 'grid_usage'] = limited_export
-          zero_pv_neg_grid = export_mask & (df['solar_generation'] <= 1e-6)
-          df.loc[zero_pv_neg_grid, 'grid_usage'] = 0.0
-      if pv_cap_kw is not None:
-          df['solar_generation'] = np.clip(df['solar_generation'], 0.0, pv_cap_kw)
-      return df
-
-  def calculate_generation_length(duration: str, samples_per_day: int) -> int:
-      """Calculate samples needed."""
-      if duration == '1_year':
-          return 365 * samples_per_day
-      elif duration == '6_months':
-          return 182 * samples_per_day
-      elif duration == '2_months':
-          return 60 * samples_per_day
-      elif duration == '1_month':
-          return 30 * samples_per_day
-      elif duration == '14_days':
-          return 14 * samples_per_day
-      elif duration == '7_days':
-          return 7 * samples_per_day
-      elif duration == '2_days':
-          return 2 * samples_per_day
-      else:
-          print(f"Warning: Unknown duration '{duration}'. Defaulting to 1 year.")
-          return 365 * samples_per_day
-
-  # =============================================================================
-  # 3. HARDCODED CONFIGURATION
-  # =============================================================================
-
-  class Config:
-      # --- Paths and Directories ---
-      MODEL_PATH = './trained_model/best_hierarchical_model.pth'
-      SCALER_PATH = './data/global_scaler.gz'
-      ORIGINAL_DATA_DIR = './data/per_house'
-      OUTPUT_DIR = './generated_data'
-
-      # --- Generation Parameters ---
-      GENERATION_DURATION = '1_year'
-      NUM_PROFILES_TO_GENERATE = 2000
-      PLOTS_TO_GENERATE = 20
-      GENERATION_BATCH_SIZE = 128
-
-      # --- Model & Training Parameters ---
-      TRAINING_WINDOW_DAYS = 14
-
-      NUM_HOUSES_TRAINED_ON = 300
-      SAMPLES_PER_DAY = 48
-      NUM_FEATURES = 4
-      DOWNSCALE_FACTOR = 4
-      EMBEDDING_DIM = 64
-      HIDDEN_SIZE = 512
-      HIDDEN_DIMS = [HIDDEN_SIZE // 4, HIDDEN_SIZE // 2, HIDDEN_SIZE]
-      DROPOUT = 0.1
-      USE_ATTENTION = True
-      DIFFUSION_TIMESTEPS = 500
-      BLOCKS_PER_LEVEL = 3
-
-
-  # =============================================================================
-  # 4. MAIN GENERATION LOGIC
-  # =============================================================================
-
-  def main(cfg, run_output_dir):
-      """Main generation logic."""
-      DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
-      print(f"Using device: {DEVICE}")
-
-      csv_output_dir = os.path.join(run_output_dir, 'csv')
-      plot_output_dir = os.path.join(run_output_dir, 'plots')
-      os.makedirs(csv_output_dir, exist_ok=True)
-      os.makedirs(plot_output_dir, exist_ok=True)
-
-      print("Loading resources...")
-      try:
-          scaler = joblib.load(cfg.SCALER_PATH)
-          if scaler.n_features_in_ != cfg.NUM_FEATURES:
-              print(f"WARNING: Scaler was fit on {scaler.n_features_in_} features, but model expects {cfg.NUM_FEATURES}.")
-
-          original_files = sorted([f for f in os.listdir(cfg.ORIGINAL_DATA_DIR) if f.endswith('.csv')])
-          if not original_files:
-              raise FileNotFoundError("No original data files found to extract timestamps.")
-
-          sample_original_df = pd.read_csv(os.path.join(cfg.ORIGINAL_DATA_DIR, original_files[0]), index_col='timestamp', parse_dates=True)
-
-          # Load 1 year of timestamps
-          full_timestamps = sample_original_df.index[:(365 * cfg.SAMPLES_PER_DAY)]
-
-          # Goal length
-          total_samples_needed = calculate_generation_length(cfg.GENERATION_DURATION, cfg.SAMPLES_PER_DAY)
-
-          # Training window length
-          TRAINING_WINDOW_SAMPLES = cfg.TRAINING_WINDOW_DAYS * cfg.SAMPLES_PER_DAY
-
-          # Clamping to max
-          if total_samples_needed > len(full_timestamps):
-              print(f"Warning: Requested {total_samples_needed} samples, but file has {len(full_timestamps)}. Clamping to max.")
-              total_samples_needed = len(full_timestamps)
-
-          print(f"Goal: Generate {total_samples_needed} samples ({cfg.GENERATION_DURATION}) per profile.")
-          print(f"Strategy: Stitching {TRAINING_WINDOW_SAMPLES}-sample chunks.")
-
-          model = HierarchicalDiffusionModel(
-              in_channels=cfg.NUM_FEATURES,
-              num_houses=cfg.NUM_HOUSES_TRAINED_ON,
-              downscale_factor=cfg.DOWNSCALE_FACTOR,
-              embedding_dim=cfg.EMBEDDING_DIM,
-              hidden_dims=cfg.HIDDEN_DIMS,
-              dropout=cfg.DROPOUT,
-              use_attention=cfg.USE_ATTENTION,
-              num_timesteps=cfg.DIFFUSION_TIMESTEPS,
-              blocks_per_level=cfg.BLOCKS_PER_LEVEL
-          )
-
-          model.load_state_dict(torch.load(cfg.MODEL_PATH, map_location=DEVICE))
-          model.to(DEVICE)
-          model.eval()
-          print("Model, scaler, timestamps ready.")
-
-      except FileNotFoundError as e:
-          print(f"ERROR: A required file was not found. Details: {e}")
-          return
-      except Exception as e:
-          print(f"An error occurred during setup: {e}")
-          return
-
-      num_batches = math.ceil(cfg.NUM_PROFILES_TO_GENERATE / cfg.GENERATION_BATCH_SIZE)
-      house_counter = 0
-
-      pbar = tqdm(range(num_batches), desc="Generating Batches")
-      for i in pbar:
-          current_batch_size = min(cfg.GENERATION_BATCH_SIZE, cfg.NUM_PROFILES_TO_GENERATE - house_counter)
-          if current_batch_size <= 0: break
-          pbar.set_postfix({'batch_size': current_batch_size})
-
-          # --- STITCHING LOGIC ---
-          num_chunks_needed = math.ceil(total_samples_needed / TRAINING_WINDOW_SAMPLES)
-          batch_chunks_list = []
-
-          for chunk_idx in range(num_chunks_needed):
-              # Calculate chunk length
-              samples_remaining = total_samples_needed - (chunk_idx * TRAINING_WINDOW_SAMPLES)
-              current_chunk_length = min(TRAINING_WINDOW_SAMPLES, samples_remaining)
-
-              shape_to_generate = (current_chunk_length, cfg.NUM_FEATURES)
-
-              # Generate random conditions
-              sample_conditions = {
-                  "house_id": torch.randint(0, cfg.NUM_HOUSES_TRAINED_ON, (current_batch_size,), device=DEVICE),
-                  "day_of_week": torch.randint(0, 7, (current_batch_size,), device=DEVICE),
-                  "day_of_year": torch.randint(0, 365, (current_batch_size,), device=DEVICE)
-              }
-
-              with torch.no_grad():
-                  # Generate one chunk
-                  generated_chunk_data = model.sample(current_batch_size, sample_conditions, shape=shape_to_generate)
-
-              batch_chunks_list.append(generated_chunk_data.cpu().numpy())
-
-          # Stitch chunks together
-          generated_data_np = np.concatenate(batch_chunks_list, axis=1)
-          # --- END OF STITCHING LOGIC ---
-
-          # --- Post-processing loop ---
-          for j in range(current_batch_size):
-              current_house_num = house_counter + 1
-              # Select timestamps
-              profile_timestamps = full_timestamps[:total_samples_needed]
-              normalized_series = generated_data_np[j]
-
-              unscaled_series = scaler.inverse_transform(normalized_series)
-
-              df = pd.DataFrame(
-                  unscaled_series,
-                  columns=['grid_usage', 'solar_generation', 'sin_time', 'cos_time'],
-                  index=profile_timestamps
-              )
-
-              df = enforce_physics(df)
-              df['grid_usage'] = add_amplitude_jitter(df['grid_usage'].values, scale=0.08, daily_samples=cfg.SAMPLES_PER_DAY)
-              df['solar_generation'] = add_cloud_variability(df['solar_generation'].values, df.index, base_sigma=0.3)
-              df = enforce_physics(df)
-
-              df_to_save = df[['grid_usage', 'solar_generation']]
-              df_to_save.to_csv(os.path.join(csv_output_dir, f'generated_house_{current_house_num}.csv'))
-
-              if house_counter < cfg.PLOTS_TO_GENERATE:
-                  plot_df = df_to_save.head(cfg.SAMPLES_PER_DAY * 14)
-                  plt.figure(figsize=(15, 6))
-                  plt.plot(plot_df.index, plot_df['grid_usage'], label='Grid Usage', color='dodgerblue', alpha=0.9)
-                  plt.plot(plot_df.index, plot_df['solar_generation'], label='Solar Generation', color='darkorange', alpha=0.9)
-                  plt.title(f'Generated Data for Profile {current_house_num} (First 14 Days)')
-                  plt.xlabel('Timestamp'); plt.ylabel('Power (kW)'); plt.legend(); plt.grid(True, which='both', linestyle='--', linewidth=0.5)
-                  plt.tight_layout()
-                  plt.savefig(os.path.join(plot_output_dir, f'generated_profile_{current_house_num}_plot.png'))
-                  plt.close()
-
-              house_counter += 1
-
-      print(f"\nSuccessfully generated and saved {house_counter} house profiles.")
-      if cfg.PLOTS_TO_GENERATE > 0:
-          print(f"Plots saved to '{plot_output_dir}'.")
-
-
-  # =============================================================================
-  # 5. --- Main execution block ---
-  # =============================================================================
-
-  if __name__ == '__main__':
-      config = Config()
-
-      # Create unique output directory
-      run_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-      run_name = f"generation_run_{config.GENERATION_DURATION}_{run_timestamp}"
-      run_output_dir = os.path.join(config.OUTPUT_DIR, run_name)
-      os.makedirs(run_output_dir, exist_ok=True)
-
-      print(f"Starting new generation run: {run_name}")
-      print(f"All outputs will be saved to: {run_output_dir}")
-
-      # Run generation
-      main(config, run_output_dir)
-
-      print("\nGeneration process complete.")
 
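As a rough check of the chunk-stitching arithmetic in the deleted main() above (not part of the original file; the numbers come straight from the hard-coded Config values): a 1-year profile at 48 samples/day is stitched from 14-day chunks, so the last chunk is trimmed to the remainder.

  # Worked example of the stitching arithmetic, using the hard-coded Config values.
  import math

  samples_per_day = 48
  total_samples_needed = 365 * samples_per_day      # '1_year' -> 17,520 samples
  training_window_samples = 14 * samples_per_day    # 14-day training window -> 672 samples

  num_chunks_needed = math.ceil(total_samples_needed / training_window_samples)
  last_chunk_length = total_samples_needed - (num_chunks_needed - 1) * training_window_samples

  print(num_chunks_needed, last_chunk_length)       # 27 chunks, final chunk of 48 samples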
Data_generation_tool_kit/train.py DELETED
@@ -1,209 +0,0 @@
-  import torch
-  from torch.utils.data import DataLoader, random_split, Subset
-  from torch.cuda.amp import autocast, GradScaler
-  from tqdm import tqdm
-  import numpy as np
-  import os
-  import datetime
-  import pandas as pd
-  import matplotlib.pyplot as plt
-  import math
-  import joblib
-
-  from dataloader import MultiHouseDataset
-  from hierarchical_diffusion_model import HierarchicalDiffusionModel
-
-  if torch.cuda.is_available():
-      DEVICE = "cuda"
-      torch.backends.cudnn.benchmark = True
-      torch.backends.cuda.matmul.allow_tf32 = True
-      print("Using NVIDIA CUDA backend.")
-  elif torch.backends.mps.is_available():
-      DEVICE = "mps"
-      print("Using Apple MPS backend.")
-  else:
-      DEVICE = "cpu"
-      print("Using CPU.")
-
-  EPOCHS = 200
-  LEARNING_RATE = 1e-4
-  BATCH_SIZE = 512
-  USE_AMP = True
-  GRADIENT_CLIP_VAL = 0.1
-
-  WINDOW_DURATION = '14_days'
-
-  DATA_DIRECTORY = './data/per_house'
-  NUM_WORKERS = os.cpu_count() // 2
-  PIN_MEMORY = True
-  USE_ATTENTION = True
-  DROPOUT = 0.1
-  HIDDEN_SIZE = 512
-  EMBEDDING_DIM = 64
-  DIFFUSION_TIMESTEPS = 500
-  DOWNSCALE_FACTOR = 4
-
-  def calculate_window_size(duration: str) -> int:
-      SAMPLES_PER_DAY = 48
-      mapping = {
-          '2_days': 2 * SAMPLES_PER_DAY,
-          '7_days': 7 * SAMPLES_PER_DAY,
-          '14_days': 14 * SAMPLES_PER_DAY,
-          '15_days': 15 * SAMPLES_PER_DAY,
-          '30_days': 30 * SAMPLES_PER_DAY
-      }
-      if duration not in mapping:
-          raise ValueError(f"Invalid WINDOW_DURATION: {duration}")
-      return mapping[duration]
-
-  def denormalize_data(normalized_data, scaler_path='global_scaler.gz'):
-      scaler = joblib.load(scaler_path)
-      original_shape = normalized_data.shape
-      if len(original_shape) == 3:
-          batch_size, seq_len, features = original_shape
-          normalized_flat = normalized_data.reshape(-1, features)
-          denormalized_flat = scaler.inverse_transform(normalized_flat)
-          return denormalized_flat.reshape(original_shape)
-      else:
-          return scaler.inverse_transform(normalized_data)
-
-  def moving_average(data, window_size):
-      return np.convolve(data, np.ones(window_size), 'valid') / window_size
-
-  def save_and_plot_loss(loss_dict, title, filepath, window_size=10):
-      plt.figure(figsize=(12, 6))
-      for label, losses in loss_dict.items():
-          pd.DataFrame({label: losses}).to_csv(f"{filepath}_{label.lower().replace(' ', '_')}.csv", index=False)
-          plt.plot(losses, label=f'Raw {label}', alpha=0.3)
-          if len(losses) > window_size:
-              smoothed_losses = moving_average(losses, window_size)
-              plt.plot(np.arange(window_size - 1, len(losses)), smoothed_losses, label=f'Smoothed {label}')
-      plt.title(title)
-      plt.xlabel('Epoch'); plt.ylabel('Loss')
-      plt.legend(); plt.grid(True)
-      plt.savefig(f"{filepath}.png"); plt.close()
-      print(f" Loss plot saved to {filepath}.png")
-
-  def train_diffusion(log_dir, model_save_path):
-      print("--- Starting Hierarchical Diffusion Training ---")
-      window_size = calculate_window_size(WINDOW_DURATION)
-      print(f"Using window duration: {WINDOW_DURATION} ({window_size} samples)")
-
-      dataset = MultiHouseDataset(
-          data_dir=DATA_DIRECTORY,
-          window_size=window_size,
-          step_size=window_size//2,
-          limit_to_one_year=False
-      )
-      print(f"Dataset loaded: {len(dataset)} samples, {dataset.num_houses} houses, {dataset[0][0].shape[1]} features.")
-
-      val_split = 0.1
-      val_size = int(len(dataset) * val_split)
-      train_size = len(dataset) - val_size
-      train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
-      print(f"Train size: {train_size}, Validation size: {val_size}")
-
-      train_dataloader = DataLoader(
-          train_dataset, batch_size=BATCH_SIZE, shuffle=True,
-          num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY, drop_last=True
-      )
-      val_dataloader = DataLoader(
-          val_dataset, batch_size=BATCH_SIZE*2, shuffle=False,
-          num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY
-      )
-
-      channel_weights = torch.tensor([1.0, 8.0, 1.0, 1.0], device=DEVICE)
-      print(f"Using channel weights: {channel_weights}")
-
-      model = HierarchicalDiffusionModel(
-          in_channels=dataset[0][0].shape[1],
-          num_houses=dataset.num_houses,
-          downscale_factor=DOWNSCALE_FACTOR,
-          channel_weights=channel_weights,
-          embedding_dim=EMBEDDING_DIM,
-          hidden_dims=[HIDDEN_SIZE // 4, HIDDEN_SIZE // 2, HIDDEN_SIZE],
-          dropout=DROPOUT,
-          use_attention=USE_ATTENTION,
-          num_timesteps=DIFFUSION_TIMESTEPS,
-          blocks_per_level=3
-      ).to(DEVICE)
-
-      optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
-      scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
-      scaler = GradScaler(enabled=(USE_AMP and DEVICE == "cuda"))
-
-      train_losses, val_losses = [], []
-      best_val_loss = float('inf')
-
-      print(f"Starting training for {EPOCHS} epochs...")
-      for epoch in range(EPOCHS):
-          model.train()
-          total_train_loss = 0.0
-          pbar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{EPOCHS} (Train)")
-
-          for clean_data, conditions in pbar:
-              clean_data = clean_data.to(DEVICE, non_blocking=PIN_MEMORY)
-              conditions = {k: v.to(DEVICE, non_blocking=PIN_MEMORY) for k, v in conditions.items()}
-
-              optimizer.zero_grad(set_to_none=True)
-              with autocast(enabled=(USE_AMP and DEVICE == "cuda")):
-                  loss = model(clean_data, conditions)
-
-              scaler.scale(loss).backward()
-              scaler.unscale_(optimizer)
-              torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP_VAL)
-              scaler.step(optimizer)
-              scaler.update()
-
-              total_train_loss += loss.item()
-              pbar.set_postfix({'loss': f'{loss.item():.6f}', 'lr': f'{scheduler.get_last_lr()[0]:.2e}'})
-
-          avg_train_loss = total_train_loss / len(train_dataloader)
-          train_losses.append(avg_train_loss)
-
-          model.eval()
-          total_val_loss = 0.0
-          with torch.no_grad():
-              for clean_data, conditions in tqdm(val_dataloader, desc="Validating"):
-                  clean_data = clean_data.to(DEVICE, non_blocking=PIN_MEMORY)
-                  conditions = {k: v.to(DEVICE, non_blocking=PIN_MEMORY) for k, v in conditions.items()}
-                  with autocast(enabled=(USE_AMP and DEVICE == "cuda")):
-                      loss = model(clean_data, conditions)
-                  total_val_loss += loss.item()
-
-          avg_val_loss = total_val_loss / len(val_dataloader)
-          val_losses.append(avg_val_loss)
-
-          print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {avg_train_loss:.6f} | Val Loss: {avg_val_loss:.6f}")
-
-          if avg_val_loss < best_val_loss:
-              best_val_loss = avg_val_loss
-              torch.save(model.state_dict(), model_save_path)
-              print(f"New best model saved to {model_save_path} (Val Loss: {best_val_loss:.6f})")
-
-          scheduler.step()
-
-      print("--- Training complete ---")
-      save_and_plot_loss(
-          {'Train Loss': train_losses, 'Validation Loss': val_losses},
-          'Hierarchical Diffusion Model Training & Validation Loss',
-          os.path.join(log_dir, 'diffusion_loss_curves')
-      )
-
-      return dataset
-
-  if __name__ == "__main__":
-      timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-      run_name = f"hierarchical_diffusion_{WINDOW_DURATION}_{timestamp}"
-      log_dir = os.path.join("./training_logs", run_name)
-      os.makedirs(log_dir, exist_ok=True)
-      model_path = os.path.join(log_dir, 'best_hierarchical_model.pth')
-
-      print(f"Starting new run: {run_name}")
-      print(f"Logs and models will be saved to: {log_dir}")
-
-      full_dataset = train_diffusion(log_dir=log_dir, model_save_path=model_path)
-
-      print("\nTraining and best model saving complete.")
-      print(f"Model saved to: {model_path}")
-      print(f"Loss curves saved to: {os.path.join(log_dir, 'diffusion_loss_curves.png')}")
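For a sense of scale, a small illustrative calculation of how many overlapping training windows one house contributes under this deleted training config (not from the original file; it assumes a house with exactly one year of half-hourly data, whereas train.py uses limit_to_one_year=False and real house lengths vary):

  # Illustrative window count per house for WINDOW_DURATION = '14_days' with a half-window step.
  samples_per_day = 48
  window_size = 14 * samples_per_day   # 672 samples, from calculate_window_size('14_days')
  step_size = window_size // 2         # 336 samples, as passed to MultiHouseDataset above

  house_length = 365 * samples_per_day # hypothetical house with exactly one year of data
  windows_per_house = (house_length - window_size) // step_size + 1
  print(windows_per_house)             # 51 overlapping windows for such a house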