Spaces:

danielvarga
/

pq

Sleeping

pq / data_processing.py

Daniel Varga

uploadable data files doing something, flaky yet

d58b6ea 7 months ago

5.55 kB

	from dataclasses import dataclass
	import numpy as np
	import pandas as pd
	from scipy.interpolate import interp1d


	# Half-open [START, END) date window used to select the 2021 records.
	START = "2021-01-01"
	END = "2022-01-01"


	def read_datasets(met_filename, cons_filename, old_dataset=False):
	    """Load the meteorology and consumption tables, each indexed by 'Time'.

	    met_filename: gzip-compressed, ';'-separated CSV whose first five lines
	        are skipped; its 'Time' column is parsed with format %Y%m%d%H%M.
	    cons_filename: tab-separated file with decimal commas; its
	        'Korrigált időpont' column is parsed with format %m/%d/%y %H:%M and
	        'Hatásos teljesítmény [kW]' is copied into a 'Consumption' column.
	    old_dataset: needed if the result is plugged into interpolate_and_join()
	        rather than join_consumption_meteorology(). When true, consumption
	        timestamps are moved forward by one minute and both tables are
	        restricted to START <= Time < END.

	    Returns a (met_data, cons_data) tuple of DataFrames.
	    """
	    # Preprocessing meteorologic data
	    met_data = pd.read_csv(
	        met_filename,
	        compression='gzip',
	        sep=';',
	        skipinitialspace=True,
	        na_values='n/a',
	        skiprows=[0, 1, 2, 3, 4],
	    )
	    # Go through str so the strptime format is applied to the digits as written.
	    met_data['Time'] = pd.to_datetime(met_data['Time'].astype(str), format='%Y%m%d%H%M')
	    met_data = met_data.set_index('Time')

	    # Preprocessing consumption data
	    cons_data = pd.read_csv(cons_filename, sep='\t', skipinitialspace=True, na_values='n/a', decimal=',')
	    cons_data['Time'] = pd.to_datetime(cons_data['Korrigált időpont'], format='%m/%d/%y %H:%M')
	    cons_data = cons_data.set_index('Time')
	    cons_data['Consumption'] = cons_data['Hatásos teljesítmény [kW]']

	    if not old_dataset:
	        return met_data, cons_data

	    # consumption data is at 14 29 44 59 minutes, we move it by 1 minute
	    # to sync it with production data:
	    cons_data.index = cons_data.index + pd.DateOffset(minutes=1)

	    met_2021_data = met_data[(met_data.index >= START) & (met_data.index < END)]
	    cons_2021_data = cons_data[(cons_data.index >= START) & (cons_data.index < END)]
	    return met_2021_data, cons_2021_data



	def interpolate(df, target_idx):
	    """Resample df onto target_idx using time-weighted linear interpolation.

	    df: DataFrame with a unique DatetimeIndex.
	    target_idx: DatetimeIndex of the timestamps wanted in the result.

	    Returns a DataFrame indexed by target_idx. Target timestamps before the
	    first or after the last observation take the nearest observed value.
	    """
	    # Interpolate on the union of both indexes: reindexing straight to
	    # target_idx would throw away every observation that does not sit
	    # exactly on a target timestamp before it could contribute.
	    combined_idx = df.index.union(target_idx)
	    return (
	        df.reindex(combined_idx)        # keep the observations, add the target slots
	        .interpolate(method="time")     # fill the slots between observations
	        .ffill().bfill()                # fill anything still missing at the edges
	        .reindex(target_idx)            # keep only the requested timestamps
	    )


	def join_consumption_meteorology(
	    met_data: pd.DataFrame,
	    cons_data: pd.DataFrame,
	    target_freq: str = "5min",
	) -> pd.DataFrame:
	    """Put consumption and meteorology on one regular time grid and join them.

	    met_data: time-indexed frame with 'Production', 'sr', 'r', 't', 'fs' columns.
	    cons_data: time-indexed frame with a 'Consumption' column.
	    target_freq: spacing of the common grid.

	    Returns a frame holding 'Consumption' followed by the meteorology
	    columns, interpolated onto the common grid. The inputs are not modified.
	    """
	    met = met_data[["Production", "sr", "r", "t", "fs"]]
	    cons = cons_data[["Consumption"]]

	    # Move consumption timestamps forward by one minute (same shift that
	    # read_datasets applies in old_dataset mode to sync with production data).
	    cons.index = cons.index + pd.DateOffset(minutes=1)

	    # Restrict both tables to the time span they share.
	    start = max(cons.index.min(), met.index.min())
	    end = min(cons.index.max(), met.index.max())
	    cons = cons.loc[start:end].copy()
	    met = met.loc[start:end].copy()

	    # there are dupes because of daylight savings time.
	    cons = cons[~cons.index.duplicated(keep="last")]

	    # NOTE(review): the last two grid points are dropped; the reason is not
	    # visible in this file.
	    common_idx = pd.date_range(start, end, freq=target_freq)[:-2]

	    cons_interp = interpolate(cons, common_idx)
	    met_interp = interpolate(met, common_idx)

	    # stitch together
	    return pd.concat([cons_interp, met_interp], axis=1)


	# BESS parameters are now in BatteryModel
	@dataclass
	class SolarParameters:
	    """Solar array parameters, with their default values."""

	    # number of solar cells [units]
	    solar_cell_num: float = 1140

	    # overall efficiency factor [dimensionless]
	    solar_efficiency: float = 0.93 * 0.96

	    # power of one panel at NOCT [W]
	    panel_power_at_NOCT: float = 280

	    # the SR (solar radiation) level at which panel_power_at_NOCT
	    # is produced [W/m^2]
	    NOCT_irradiation: float = 800


	# mutates met_2021_data
	def add_production_field(met_2021_data, parameters):
	    """Add a 'Production' column derived from the 'sr' column.

	    met_2021_data: DataFrame with an 'sr' column (W/m^2); modified in place.
	    parameters: object providing solar_cell_num, solar_efficiency,
	        panel_power_at_NOCT and NOCT_irradiation.

	    Returns None. Production is proportional to sr and never negative.
	    """
	    irradiation = met_2021_data['sr']

	    # TODO use something a bit more fancy nonlinear if we have the temperature anyway.
	    # Linear model: panel output scales with sr relative to the NOCT level.
	    array_output = (
	        irradiation
	        * parameters.solar_cell_num
	        * parameters.solar_efficiency
	        * parameters.panel_power_at_NOCT
	        / parameters.NOCT_irradiation
	    )
	    # Divide by 1e3 (presumably W -> kW), then floor negative readings at zero.
	    production = (array_output / 1e3).clip(lower=0)
	    met_2021_data['Production'] = production


	def interpolate_and_join(met_2021_data, cons_2021_data):
	    """Obsolete: put one year of consumption and production on a 5-minute grid.

	    Superseded by join_consumption_meteorology(); kept for reference only.

	    met_2021_data: frame whose 'Production' column has one value per
	        10 minutes for 365 days (52560 rows).
	    cons_2021_data: frame whose 'Consumption' column has one value per
	        15 minutes for 365 days (35040 rows).

	    Returns a frame with 'Consumption' and 'Production' columns indexed by
	    5-minute timestamps starting at START.

	    Raises ValueError (from interp1d) if the inputs do not have exactly the
	    row counts above.
	    """
	    print("this is obsoleted by join_consumption_meteorology(), do not use")

	    # Sample positions are expressed in minutes since the start of the year.
	    minutes_in_year = 365 * 24 * 60
	    cons_step = 15
	    met_step = 10
	    target_step = 5

	    # The last consumption sample sits at minutes_in_year - cons_step, which
	    # is the end of the range both series can be interpolated on; the
	    # exclusive upper bound of the target grid is one target step past it.
	    applicable = minutes_in_year - cons_step + target_step

	    demand_f = interp1d(range(0, minutes_in_year, cons_step), cons_2021_data['Consumption'])
	    demand_interp = demand_f(range(0, applicable, target_step))

	    production_f = interp1d(range(0, minutes_in_year, met_step), met_2021_data['Production'])
	    production_interp = production_f(range(0, applicable, target_step))

	    all_2021_datetimeindex = pd.date_range(start=START, end=END, freq='5min')[:len(production_interp)]

	    all_2021_data = pd.DataFrame({'Consumption': demand_interp, 'Production': production_interp})
	    all_2021_data = all_2021_data.set_index(all_2021_datetimeindex)
	    return all_2021_data


	# TODO build a dataframe instead
	def monthly_analysis(results):
	    """Sum consumption per source for each month of 2021, in MWh.

	    results: DataFrame with a fixed-frequency DatetimeIndex and the columns
	        'consumption_from_network', 'consumption_from_solar' and
	        'consumption_from_bess', given in kW.

	    Returns a (12, 3) array: one row per month (January first), columns in
	    the order network, solar, bess. Months without data are all zeros.

	    Raises ValueError if results.index has no frequency set.
	    """
	    freq = results.index.freq
	    if freq is None:
	        raise ValueError("results.index must have a fixed frequency (e.g. '5min')")
	    # Convert the tick length to minutes whatever unit the frequency uses;
	    # freq.n alone is only correct for minute-based frequencies.
	    step_in_minutes = pd.Timedelta(freq).total_seconds() / 60

	    sources = ['consumption_from_network', 'consumption_from_solar', 'consumption_from_bess']
	    # Month boundaries as date strings; month i covers [bounds[i], bounds[i+1]).
	    bounds = [f"2021-{month:02}-01" for month in range(1, 13)] + ["2022-01-01"]

	    consumptions = []
	    for start, end in zip(bounds, bounds[1:]):
	        results_in_month = results[(results.index >= start) & (results.index < end)]
	        consumptions.append([results_in_month[column].sum() for column in sources])
	    consumptions = np.array(consumptions)

	    # consumption is given in kW. each tick is step_in_minutes long,
	    # so sum * step_in_minutes / 60 is kWh; / 1000 turns that into MWh.
	    consumptions_in_mwh = consumptions * (step_in_minutes / 60) / 1000
	    return consumptions_in_mwh