Spaces:

Bhuvi20
/

forecast

Paused

forecast / src /data.py

Bhuvanesh24

Added app.py

4e188a6 over 1 year ago

4.41 kB

	import os
	import pandas as pd
	import numpy as np
	import torch
	from torch.utils.data import Dataset, DataLoader
	from sklearn.preprocessing import StandardScaler

	class WaterDataset(Dataset):
	    """Sliding-window dataset of population/land-use features with next-year water usage labels.

	    Each sample is ``sequence_length`` consecutive rows (sorted by year) of one
	    state's merged population + land-use data; its label is that state's
	    ``Domestic``, ``Industrial`` and ``Irrigation`` usage for the year after the
	    window's last year. Features are standardised with a ``StandardScaler``
	    fitted on all samples.
	    """

	    # Feature columns of the merged frame, in the order they appear in each sample.
	    _FEATURE_COLUMNS = ('population', 'urban_population', 'rural_population',
	                        'forest', 'barren', 'others', 'fallow', 'cropped')
	    # Columns of the usage table that form the label vector.
	    _LABEL_COLUMNS = ('Domestic', 'Industrial', 'Irrigation')
	    _MERGED_COLUMNS = ('year', 'state') + _FEATURE_COLUMNS

	    def __init__(self, sequence_length=5, transform=None, data_dir='data'):
	        """
	        Loads the LUC, population and usage CSVs, merges them, builds the
	        training sequences and standardises the features.

	        Args:
	            sequence_length (int): Number of rows (years) in each input sequence.
	            transform (callable, optional): Optional transform applied to each
	                ``(features, label)`` sample returned by ``__getitem__``.
	            data_dir (str): Directory containing ``luc.csv``, ``population.csv``
	                and ``usage.csv``. Defaults to ``'data'``.

	        Raises:
	            ValueError: If ``sequence_length`` is smaller than 1, or if no
	                sequence with a matching usage label could be built.
	        """
	        super().__init__()
	        if sequence_length < 1:
	            raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

	        self.sequence_length = sequence_length
	        self.transform = transform
	        self.luc = pd.read_csv(os.path.join(data_dir, 'luc.csv'))
	        self.population = pd.read_csv(os.path.join(data_dir, 'population.csv'))
	        self.usage = pd.read_csv(os.path.join(data_dir, 'usage.csv'))

	        self.years = sorted(set(self.usage['Year']))
	        self.states = sorted(set(self.usage['State']))
	        self.all_years = sorted(set(self.population['Year']))

	        self.df = self.merge_data()
	        self.x, self.y = self.create_sequence()

	        if len(self.x) == 0:
	            # Fail with a readable message instead of an opaque reshape error.
	            raise ValueError(
	                "No training sequences could be built: check that the CSV files "
	                "share years/states and that sequence_length is not too large."
	            )

	        # StandardScaler expects 2-D input, so flatten (samples, steps) into rows,
	        # scale per feature column, then restore the original 3-D shape.
	        self.scaler = StandardScaler()
	        flat_features = self.x.reshape(-1, self.x.shape[-1])
	        self.x = self.scaler.fit_transform(flat_features).reshape(self.x.shape)

	    def merge_data(self):
	        """
	        Merges land use classification (LUC) and population data by year and state.

	        For every (year, state) pair the first matching population row and the
	        first matching LUC row are combined; pairs missing either are skipped.
	        Population rows are matched on state only when the population table has
	        a ``State`` column; otherwise the year's row is shared by all states.

	        Returns:
	            pd.DataFrame: One row per (year, state) with population, urban/rural
	            breakdown and LUC attributes. Always has the full set of columns,
	            even when no pair matched.
	        """
	        population_has_state = 'State' in self.population.columns
	        merged_rows = []

	        for year in self.all_years:
	            # Filter by year once per year rather than once per (year, state).
	            year_population = self.population[self.population['Year'] == year]
	            year_luc = self.luc[self.luc['Year'] == year]

	            for state in self.states:
	                population_data = year_population
	                if population_has_state:
	                    population_data = year_population[year_population['State'] == state]
	                luc_data = year_luc[year_luc['State'] == state]

	                if population_data.empty or luc_data.empty:
	                    continue

	                merged_rows.append({
	                    'year': year,
	                    'state': state,
	                    'population': population_data['Population'].values[0],
	                    'urban_population': population_data['Urban Population'].values[0],
	                    'rural_population': population_data['Rural Population'].values[0],
	                    'forest': luc_data['Forest'].values[0],
	                    'barren': luc_data['Barren'].values[0],
	                    'others': luc_data['Others'].values[0],
	                    'fallow': luc_data['Fallow'].values[0],
	                    'cropped': luc_data['Cropped'].values[0],
	                })

	        # Explicit columns keep 'state'/'year' addressable when nothing matched.
	        return pd.DataFrame(merged_rows, columns=list(self._MERGED_COLUMNS))

	    def create_sequence(self):
	        """
	        Creates sequences of input data and their corresponding labels.

	        Target years for which no usage row exists are recorded per state in
	        ``self.missing_sequences`` and the corresponding window is skipped.

	        Returns:
	            tuple: Two numpy arrays, the feature sequences and the labels.
	            Both are empty when no window has a matching usage label.
	        """
	        feature_columns = list(self._FEATURE_COLUMNS)
	        label_columns = list(self._LABEL_COLUMNS)
	        data_sequences, label_sequences = [], []
	        missing_sequences = {state: [] for state in self.states}

	        for state in self.states:
	            state_data = self.df[self.df['state'] == state].sort_values('year')
	            usage_state_data = self.usage[self.usage['State'] == state]

	            # "+ 1" includes the final window; its label comes from the separate
	            # usage table, so it may exist even though no later feature row does.
	            window_count = len(state_data) - self.sequence_length + 1
	            for start in range(window_count):
	                window = state_data.iloc[start:start + self.sequence_length]
	                target_year = window['year'].values[-1] + 1

	                usage_label = usage_state_data[usage_state_data['Year'] == target_year]
	                if usage_label.empty:
	                    missing_sequences[state].append(target_year)
	                    continue

	                data_sequences.append(window[feature_columns].values.astype(np.float32))
	                label_sequences.append(usage_label[label_columns].values[0].astype(np.float32))

	        # Keep the diagnostic information available instead of discarding it.
	        self.missing_sequences = missing_sequences
	        return np.array(data_sequences), np.array(label_sequences)

	    def __len__(self):
	        """Returns the number of samples."""
	        return len(self.x)

	    def __getitem__(self, index):
	        """
	        Returns the sample at ``index`` as a ``(features, label)`` pair of
	        float32 tensors, passed through ``self.transform`` when one was given.
	        """
	        sample = (torch.tensor(self.x[index], dtype=torch.float32),
	                  torch.tensor(self.y[index], dtype=torch.float32))
	        # NOTE(review): the transform is assumed to take and return the whole
	        # (features, label) sample - confirm against callers that pass one.
	        if self.transform is not None:
	            sample = self.transform(sample)
	        return sample