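"""Dataset utilities for multi-house half-hourly energy time series.

MultiHouseDataset cuts sliding windows from per-house CSVs of grid usage and
solar generation, appends sin/cos time-of-day features, applies a shared
global MinMaxScaler, and attaches calendar conditions (house id, day of week,
day of year). LatentDataset pairs pre-computed latent vectors with house ids.
"""
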
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib
import os
from typing import Tuple, Dict
import warnings
warnings.filterwarnings('ignore')
class MultiHouseDataset(torch.utils.data.Dataset):
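    """Sliding-window dataset over per-house half-hourly CSV files.

    Each CSV must contain 'timestamp', 'grid_usage', and 'solar_generation'
    columns sampled every 30 minutes. Windows of `window_size` consecutive
    timesteps are cut with a stride of `step_size`; sin/cos time-of-day
    channels are appended, and all four channels are scaled to [-1, 1] by a
    single MinMaxScaler shared across houses. Each sample is a
    (window, conditions) pair, where conditions holds the house id, day of
    week, and zero-based day of year at the window's first timestep.
    """
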
    def __init__(self, data_dir: str, window_size: int = 96, step_size: int = 1,
                 scaler_path: str = 'global_scaler.gz', cache_in_memory: bool = True,
                 dtype: torch.dtype = torch.float32, limit_to_one_year: bool = True):
        self.window_size = window_size
        self.step_size = step_size
        self.cache_in_memory = cache_in_memory
        self.dtype = dtype
        self.limit_to_one_year = limit_to_one_year

        all_files = sorted([f for f in os.listdir(data_dir) if f.endswith('.csv')])
        print(f"Found {len(all_files)} house files in '{data_dir}'.")
        self.num_houses = len(all_files)

        print("Reading house data...")
        if self.limit_to_one_year:
            print("INFO: Limiting data to the first year (17,520 samples) for each house.")

        data_per_house = []
        timestamps_per_house = []
        SAMPLES_PER_YEAR = 17520  # 365 days * 48 half-hour slots per day
        for filename in all_files:
            df = pd.read_csv(os.path.join(data_dir, filename), parse_dates=['timestamp'])
            time_series_values = df[['grid_usage', 'solar_generation']].values.astype(np.float32)
            if self.limit_to_one_year:
                time_series_values = time_series_values[:SAMPLES_PER_YEAR]
            # Keep timestamps aligned with the (possibly truncated) series.
            timestamps_per_house.append(df['timestamp'].values[:len(time_series_values)])

            # Encode time of day as sin/cos so midnight and 23:30 are adjacent.
            # Note: index % 48 assumes each file starts at the 00:00 slot.
            num_timesteps = len(time_series_values)
            timesteps_of_day = np.arange(num_timesteps) % 48
            sin_time = np.sin(2 * np.pi * timesteps_of_day / 48.0).astype(np.float32)
            cos_time = np.cos(2 * np.pi * timesteps_of_day / 48.0).astype(np.float32)
            time_series_values = np.concatenate([
                time_series_values,
                sin_time[:, np.newaxis],
                cos_time[:, np.newaxis]
            ], axis=1)
            data_per_house.append(time_series_values)
        # One scaler is fit across all houses so every series shares the same
        # scale; reuse it from disk if it already exists.
        if os.path.exists(scaler_path):
            scaler = joblib.load(scaler_path)
            print(f"Scaler loaded from {scaler_path}")
        else:
            print("Fitting global scaler...")
            combined_data = np.vstack(data_per_house)
            scaler = MinMaxScaler(feature_range=(-1, 1))
            scaler.fit(combined_data)
            joblib.dump(scaler, scaler_path)
            print(f"Scaler saved to {scaler_path}")
        if self.cache_in_memory:
            print("Caching normalized data...")
            self.normalized_data_per_house = []
            for series in data_per_house:
                normalized = scaler.transform(series)
                tensor_data = torch.from_numpy(normalized).to(dtype=self.dtype)
                self.normalized_data_per_house.append(tensor_data)
        else:
            # Keep numpy arrays; __getitem__ converts slices to tensors lazily.
            self.normalized_data_per_house = [scaler.transform(series) for series in data_per_house]
        del data_per_house
print("Pre-computing mappings...")
self.windows_per_house = [(len(d) - self.window_size) // self.step_size + 1 for d in self.normalized_data_per_house]
self.cumulative_windows = np.cumsum([0] + self.windows_per_house)
self.total_windows = self.cumulative_windows[-1]
self.sample_to_house = np.empty(self.total_windows, dtype=np.int32)
self.sample_to_local_idx = np.empty(self.total_windows, dtype=np.int32)
self.sample_to_day_of_week = np.empty(self.total_windows, dtype=np.int32)
self.sample_to_day_of_year = np.empty(self.total_windows, dtype=np.int32)
for house_idx in range(self.num_houses):
start_global_idx = self.cumulative_windows[house_idx]
end_global_idx = self.cumulative_windows[house_idx + 1]
num_windows_for_this_house = self.windows_per_house[house_idx]
self.sample_to_house[start_global_idx:end_global_idx] = house_idx
local_indices = np.arange(num_windows_for_this_house) * self.step_size
self.sample_to_local_idx[start_global_idx:end_global_idx] = local_indices
house_timestamps = pd.Series(timestamps_per_house[house_idx][local_indices])
self.sample_to_day_of_week[start_global_idx:end_global_idx] = house_timestamps.dt.dayofweek
self.sample_to_day_of_year[start_global_idx:end_global_idx] = house_timestamps.dt.dayofyear - 1
print(f"Dataset initialized. Total windows: {self.total_windows} from {self.num_houses} houses.")
memory_usage = sum(data.numel() * data.element_size() for data in self.normalized_data_per_house) / 1e6 if self.cache_in_memory else 0
print(f"Memory usage for cached tensors: {memory_usage:.1f} MB")
    def __len__(self) -> int:
        return self.total_windows

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        if idx < 0 or idx >= self.total_windows:
            raise IndexError("Index out of range")
        house_index = self.sample_to_house[idx]
        local_start_pos = self.sample_to_local_idx[idx]
        window_data = self.normalized_data_per_house[house_index][local_start_pos : local_start_pos + self.window_size]
        if not self.cache_in_memory:
            # Non-cached data is stored as numpy; convert the slice here so the
            # return type is a torch.Tensor in both configurations.
            window_data = torch.from_numpy(np.ascontiguousarray(window_data)).to(dtype=self.dtype)
        conditions = {
            "house_id": torch.tensor(house_index, dtype=torch.long),
            "day_of_week": torch.tensor(self.sample_to_day_of_week[idx], dtype=torch.long),
            "day_of_year": torch.tensor(self.sample_to_day_of_year[idx], dtype=torch.long),
        }
        return window_data, conditions

    def get_memory_usage(self) -> dict:
        if self.cache_in_memory:
            tensor_memory = sum(data.numel() * data.element_size() for data in self.normalized_data_per_house) / 1e6
        else:
            tensor_memory = 0
        mapping_memory = (self.sample_to_house.nbytes + self.sample_to_local_idx.nbytes) / 1e6
        return {
            'tensor_cache_mb': tensor_memory,
            'mapping_arrays_mb': mapping_memory,
            'total_mb': tensor_memory + mapping_memory
        }

class LatentDataset(torch.utils.data.Dataset):
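    """Pairs pre-computed latent vectors with their house ids.

    Intended for training on encodings produced elsewhere (e.g. by an encoder
    pass over MultiHouseDataset windows):

        latent_ds = LatentDataset(latent_vectors, house_ids)
        z, house_id = latent_ds[0]
    """
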
    def __init__(self, latent_vectors: torch.Tensor, house_ids: torch.Tensor):
        assert len(latent_vectors) == len(house_ids), "Latent vectors and house IDs must have same length"
        self.latent_vectors = latent_vectors.contiguous()
        self.house_ids = house_ids.contiguous()

    def __len__(self) -> int:
        return len(self.latent_vectors)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        return self.latent_vectors[idx], self.house_ids[idx]

if __name__ == "__main__":
    import time

    DATA_DIRECTORY = './data/per_house/'
    if os.path.exists(DATA_DIRECTORY):
        print("--- Testing Dataset Setup ---")
        start_time = time.time()
        dataset = MultiHouseDataset(data_dir=DATA_DIRECTORY, window_size=96, step_size=96)
        init_time = time.time() - start_time
        print(f"Dataset initialization: {init_time:.2f}s")
        print(f"Memory usage: {dataset.get_memory_usage()}")
        if len(dataset) > 0:
            first_sample, first_conditions = dataset[0]
            print(f"\nSample data shape: {first_sample.shape}")
            print(f"Sample conditions: {first_conditions}")
            print(f"Total houses: {dataset.num_houses}")
    else:
        print(f"ERROR: Data directory not found at '{DATA_DIRECTORY}'. Please create and populate this directory.")