Spaces:
Sleeping
Sleeping
File size: 4,406 Bytes
e69ee07 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
class WaterDataset(Dataset):
    """Sliding-window dataset pairing population/land-use history with water usage.

    Each sample is a window of ``sequence_length`` consecutive rows (per state,
    ordered by year) of merged population and land-use features, labelled with
    the water usage recorded for that state in the year following the window's
    last year.

    Attributes:
        sequence_length: Number of rows in every input window.
        luc, population, usage: Raw frames read from ``luc.csv``,
            ``population.csv`` and ``usage.csv``.
        df: Merged per-(year, state) feature frame built by ``merge_data``.
        x: Standard-scaled feature windows, one entry per sample.
        y: Usage labels (columns ``LABEL_COLUMNS``), one row per sample.
        scaler: The ``StandardScaler`` fitted on the flattened feature windows.
        missing_sequences: Per-state list of target years for which no usage
            label was found (set by ``create_sequence``).
    """

    # Columns of the merged frame that form the model input, in output order.
    FEATURE_COLUMNS = ('population', 'urban_population', 'rural_population',
                       'forest', 'barren', 'others', 'fallow', 'cropped')
    # Columns of the usage frame that form the label vector, in output order.
    LABEL_COLUMNS = ('Domestic', 'Industrial', 'Irrigation')

    def __init__(self, sequence_length=5, transform=None, data_dir='data'):
        """
        Initializes the dataset by loading LUC, population, and usage data, merging them
        and creating scaled sequences of data for training.

        Args:
            sequence_length (int): The length of each data sequence for time series forecasting.
            transform (callable, optional): Optional transform applied to each
                ``(features, label)`` sample returned by ``__getitem__``.
            data_dir (str): Directory containing ``luc.csv``, ``population.csv``
                and ``usage.csv``. Defaults to ``'data'``.

        Raises:
            ValueError: If ``sequence_length`` is smaller than 1, or if no
                sequence with a matching usage label could be built.
        """
        if sequence_length < 1:
            raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

        self.sequence_length = sequence_length
        self.luc = pd.read_csv(os.path.join(data_dir, 'luc.csv'))
        self.population = pd.read_csv(os.path.join(data_dir, 'population.csv'))
        self.usage = pd.read_csv(os.path.join(data_dir, 'usage.csv'))
        self.transform = transform

        self.years = sorted(set(self.usage['Year']))
        self.states = sorted(set(self.usage['State']))
        self.all_years = sorted(set(self.population['Year']))

        self.df = self.merge_data()
        self.x, self.y = self.create_sequence()
        if len(self.x) == 0:
            # Without this guard the reshape below fails with an opaque message.
            raise ValueError(
                "No training sequences could be built: check that the LUC, "
                "population and usage files overlap in year and state."
            )

        # StandardScaler works on 2-D input, so flatten the windows to
        # (samples * steps, features), scale, then restore the original shape.
        self.scaler = StandardScaler()
        flat_features = self.x.reshape(-1, self.x.shape[-1])
        self.x = self.scaler.fit_transform(flat_features).reshape(self.x.shape)

    def merge_data(self):
        """
        Merges land use classification (LUC) and population data into one row
        per (year, state) pair for which both sources have data.

        Returns:
            pd.DataFrame: Columns ``year``, ``state`` followed by
            ``FEATURE_COLUMNS``. Rows are ordered by year, then state. The
            columns are present even when no row could be merged.
        """
        merged_rows = []
        for year in self.all_years:
            # NOTE(review): population is matched on year only, so every state
            # receives the first population row of that year. Confirm that
            # population.csv is not broken down by state.
            population_data = self.population[self.population['Year'] == year]
            if population_data.empty:
                continue
            luc_for_year = self.luc[self.luc['Year'] == year]

            for state in self.states:
                luc_data = luc_for_year[luc_for_year['State'] == state]
                if luc_data.empty:
                    continue
                # When several rows match, only the first one is used.
                merged_rows.append({
                    'year': year,
                    'state': state,
                    'population': population_data['Population'].values[0],
                    'urban_population': population_data['Urban Population'].values[0],
                    'rural_population': population_data['Rural Population'].values[0],
                    'forest': luc_data['Forest'].values[0],
                    'barren': luc_data['Barren'].values[0],
                    'others': luc_data['Others'].values[0],
                    'fallow': luc_data['Fallow'].values[0],
                    'cropped': luc_data['Cropped'].values[0],
                })

        # Explicit columns keep the frame usable (e.g. df['state']) when empty.
        return pd.DataFrame(merged_rows, columns=['year', 'state', *self.FEATURE_COLUMNS])

    def create_sequence(self):
        """
        Creates sequences of input data and their corresponding labels for training.

        For every state, consecutive windows of ``sequence_length`` merged rows
        are paired with that state's usage row for the year after the window's
        last year. Windows without such a usage row are skipped and their
        target year is recorded in ``self.missing_sequences``.

        Returns:
            tuple: Two numpy arrays, one for data sequences and one for label sequences.
        """
        feature_columns = list(self.FEATURE_COLUMNS)
        label_columns = list(self.LABEL_COLUMNS)
        data_sequences, label_sequences = [], []
        missing_sequences = {state: [] for state in self.states}

        for state in self.states:
            state_data = self.df[self.df['state'] == state].sort_values('year')
            usage_state_data = self.usage[self.usage['State'] == state]

            # NOTE(review): this range never yields the window ending on the
            # state's last merged row; confirm that is intended, since the
            # label comes from the usage frame rather than from state_data.
            for start in range(len(state_data) - self.sequence_length):
                window = state_data.iloc[start:start + self.sequence_length]
                # The label year is derived from the window's last row, so
                # gaps in the merged years are not detected here.
                target_year = window['year'].values[-1] + 1
                usage_label = usage_state_data[usage_state_data['Year'] == target_year]
                if usage_label.empty:
                    missing_sequences[state].append(target_year)
                    continue

                data_sequences.append(window[feature_columns].values.astype(np.float32))
                label_sequences.append(usage_label[label_columns].values[0].astype(np.float32))

        # Kept for diagnostics: which target years had no usage label, per state.
        self.missing_sequences = missing_sequences
        return np.array(data_sequences), np.array(label_sequences)

    def __len__(self):
        """Returns the number of samples (feature windows) in the dataset."""
        return len(self.x)

    def __getitem__(self, index):
        """
        Returns the sample at ``index`` as float32 tensors.

        Args:
            index (int): Position of the sample.

        Returns:
            tuple: ``(features, label)`` tensors, or whatever ``transform``
            returns for that tuple when a transform was supplied.
        """
        sample = (torch.tensor(self.x[index], dtype=torch.float32),
                  torch.tensor(self.y[index], dtype=torch.float32))
        if self.transform is not None:
            sample = self.transform(sample)
        return sample