| import os | |
| import pandas as pd | |
| import numpy as np | |
| import torch | |
| from torch.utils.data import Dataset, DataLoader | |
| from sklearn.preprocessing import StandardScaler | |
class WaterDataset(Dataset):
    """Sliding-window dataset pairing LUC/population features with water usage.

    Each sample is a window of ``sequence_length`` rows (sorted by year) for one
    state, holding the eight feature columns listed in ``_FEATURE_COLUMNS``.
    Its label is the ``Domestic``/``Industrial``/``Irrigation`` usage row for
    the same state in the year following the last year of the window.

    Attributes set by ``__init__``:
        luc, population, usage: the raw tables read from CSV.
        years, states: sorted unique ``Year`` / ``State`` values of ``usage``.
        all_years: sorted unique ``Year`` values of ``population``.
        df: merged per-(year, state) feature table (see ``merge_data``).
        x, y: feature windows (standardized) and labels (unscaled).
        scaler: the fitted ``StandardScaler`` used on ``x``.
        missing_sequences: per-state list of label years with no usage row.
    """

    # Source column -> merged column, for the population table.
    _POPULATION_COLUMNS = {
        'Population': 'population',
        'Urban Population': 'urban_population',
        'Rural Population': 'rural_population',
    }
    # Source column -> merged column, for the land-use (LUC) table.
    _LUC_COLUMNS = {
        'Forest': 'forest',
        'Barren': 'barren',
        'Others': 'others',
        'Fallow': 'fallow',
        'Cropped': 'cropped',
    }
    # Order of the feature axis in ``x``.
    _FEATURE_COLUMNS = ['population', 'urban_population', 'rural_population',
                        'forest', 'barren', 'others', 'fallow', 'cropped']
    # Order of the label axis in ``y``.
    _LABEL_COLUMNS = ['Domestic', 'Industrial', 'Irrigation']

    def __init__(self, sequence_length=5, transform=None, data_dir='data'):
        """
        Loads LUC, population, and usage data, merges them on year and state,
        builds the training sequences and standardizes the features.

        Args:
            sequence_length (int): Number of consecutive rows in each input window.
            transform (callable, optional): Optional transform applied to each
                ``(features, label)`` sample returned by ``__getitem__``.
            data_dir (str): Directory containing ``luc.csv``, ``population.csv``
                and ``usage.csv``. Defaults to ``'data'``.

        Raises:
            ValueError: If ``sequence_length`` is smaller than 1, or if no
                sequence could be built from the data.
        """
        if sequence_length < 1:
            raise ValueError(
                f"sequence_length must be >= 1, got {sequence_length}")
        self.sequence_length = sequence_length
        self.transform = transform

        self.luc = pd.read_csv(os.path.join(data_dir, 'luc.csv'))
        self.population = pd.read_csv(os.path.join(data_dir, 'population.csv'))
        self.usage = pd.read_csv(os.path.join(data_dir, 'usage.csv'))

        self.years = sorted(set(self.usage['Year']))
        self.states = sorted(set(self.usage['State']))
        self.all_years = sorted(set(self.population['Year']))

        self.df = self.merge_data()
        self.x, self.y = self.create_sequence()
        if len(self.x) == 0:
            raise ValueError(
                "No training sequences could be built: check that the LUC, "
                "population and usage tables overlap in year and state.")

        # Standardize per feature: flatten (samples, steps, features) to
        # (samples * steps, features), scale, then restore the window shape.
        # NOTE(review): the scaler is fitted on every sequence in the dataset;
        # if this dataset is later split into train/test, statistics leak
        # across the split - confirm this is acceptable.
        self.scaler = StandardScaler()
        flat_features = self.x.reshape(-1, self.x.shape[-1])
        self.x = self.scaler.fit_transform(flat_features).reshape(self.x.shape)

    def merge_data(self):
        """
        Merges land use classification (LUC) and population data on year and state.

        Only years present in ``self.all_years`` and states present in
        ``self.states`` are kept. When several rows share the same key, the
        first one wins. If the population table has a ``State`` column it is
        joined on ``(Year, State)``; otherwise it is treated as one record per
        year shared by every state.

        Returns:
            pd.DataFrame: One row per (year, state) with columns ``year``,
            ``state`` and the eight feature columns, sorted by year then state.
        """
        # Join population per state only when the table actually carries one;
        # otherwise every state would need the same (first) row of the year.
        population_keys = ['Year']
        if 'State' in self.population.columns:
            population_keys.append('State')

        population = self.population[
            self.population['Year'].isin(self.all_years)]
        population = (
            population
            .dropna(subset=population_keys)
            .drop_duplicates(subset=population_keys)
            [population_keys + list(self._POPULATION_COLUMNS)]
        )

        luc_keys = ['Year', 'State']
        luc = self.luc[self.luc['State'].isin(self.states)]
        luc = (
            luc
            .dropna(subset=luc_keys)
            .drop_duplicates(subset=luc_keys)
            [luc_keys + list(self._LUC_COLUMNS)]
        )

        # Inner join: a (year, state) pair is kept only if both tables have it.
        merged = luc.merge(population, on=population_keys, how='inner')
        merged = merged.rename(columns={
            'Year': 'year',
            'State': 'state',
            **self._POPULATION_COLUMNS,
            **self._LUC_COLUMNS,
        })
        merged = merged[['year', 'state'] + self._FEATURE_COLUMNS]
        return merged.sort_values(['year', 'state']).reset_index(drop=True)

    def create_sequence(self):
        """
        Creates sequences of input data and their corresponding labels for training.

        For every state, each window of ``sequence_length`` rows of ``self.df``
        (sorted by year) is paired with the usage row whose ``Year`` equals the
        window's last year plus one. Windows without such a usage row are
        skipped and their label year is recorded in ``self.missing_sequences``.

        NOTE(review): windows are taken over consecutive *rows*; the code does
        not verify that the years inside a window are contiguous.

        Returns:
            tuple: ``(data, labels)`` as float32 numpy arrays of shape
            ``(n, sequence_length, 8)`` and ``(n, 3)``.
        """
        window = self.sequence_length
        data_sequences, label_sequences = [], []
        missing_sequences = {state: [] for state in self.states}

        for state in self.states:
            state_rows = self.df[self.df['state'] == state].sort_values('year')
            state_usage = self.usage[self.usage['State'] == state]
            years = state_rows['year'].to_numpy()
            features = state_rows[self._FEATURE_COLUMNS].to_numpy(dtype=np.float32)

            # "+ 1" so the last full window is included: its label comes from
            # the usage table, not from the feature rows.
            for start in range(len(state_rows) - window + 1):
                target_year = years[start + window - 1] + 1
                label_rows = state_usage[state_usage['Year'] == target_year]
                if label_rows.empty:
                    missing_sequences[state].append(target_year)
                    continue
                data_sequences.append(features[start:start + window])
                label_sequences.append(
                    label_rows[self._LABEL_COLUMNS].to_numpy(dtype=np.float32)[0])

        # Keep the bookkeeping available to callers instead of discarding it.
        self.missing_sequences = missing_sequences

        if not data_sequences:
            # Return correctly shaped empties so callers can rely on ndim.
            return (
                np.empty((0, window, len(self._FEATURE_COLUMNS)), dtype=np.float32),
                np.empty((0, len(self._LABEL_COLUMNS)), dtype=np.float32),
            )
        return np.stack(data_sequences), np.stack(label_sequences)

    def __len__(self):
        """Returns the number of (sequence, label) samples."""
        return len(self.x)

    def __getitem__(self, index):
        """
        Returns sample ``index`` as a ``(features, label)`` pair of float32 tensors.

        If a ``transform`` was supplied, it is called with the
        ``(features, label)`` tuple and its return value is returned instead.
        """
        sample = (torch.tensor(self.x[index], dtype=torch.float32),
                  torch.tensor(self.y[index], dtype=torch.float32))
        if self.transform is not None:
            sample = self.transform(sample)
        return sample