# Page header residue: hkayabilisim's picture
# Commit message: feature: added inference test
# Commit: fee4a7e
import pandas as pd
import torch
from torch.utils.data import Dataset
import numpy as np
class ExplorationDataset(Dataset):
    """Torch dataset serving normalized input/output rows of a DataFrame.

    Each selected column is normalized with a per-column rule stored in
    ``transform_dict`` as ``{'shift', 'divide', 'logtransform'}``. The
    forward transform is ``(log10(x) if logtransform else x - shift) / divide``
    (the shift is applied after the optional log); the inverse transform
    undoes these steps in reverse order.
    """

    def __init__(self, df: pd.DataFrame,
                 input_cols,
                 output_cols, transform_dict=None):
        """Store the frame and pre-compute the transformed tensors.

        Args:
            df: Source data; must contain every name in ``input_cols`` and
                ``output_cols``.
            input_cols: List of column names used as model inputs.
            output_cols: List of column names used as model outputs.
            transform_dict: Optional pre-fitted normalization parameters
                (same layout as returned by ``transform_fit``). When
                ``None`` the parameters are fitted on ``df``.
        """
        super().__init__()
        self.df = df
        self.input_cols = input_cols
        self.output_cols = output_cols
        if transform_dict is None:
            self.transform_dict = self.transform_fit(df)
        else:
            # Reuse parameters fitted elsewhere instead of refitting on df.
            self.transform_dict = transform_dict
        # Transform once up front so __getitem__ is a plain tensor index.
        self.input_transformed = torch.tensor(
            self.input_transform(df[input_cols]).values).to(torch.float)
        self.output_transformed = torch.tensor(
            self.output_transform(df[output_cols]).values).to(torch.float)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return the ``(inputs, outputs)`` pair of transformed rows at ``idx``."""
        inputs = self.input_transformed[idx]
        outputs = self.output_transformed[idx]
        return inputs, outputs

    def __len__(self) -> int:
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def transform_fit(self, df):
        """Compute normalization parameters for every input and output column.

        Rules, keyed on the column name:
            * ``replica``, ``cpu``, ``heap``: shift by the median, no scaling.
            * ``expected_tps``: shift by the mean, divide by the std.
            * ``num_request``: divide by the maximum.
            * ``response_time``: take ``log10`` first, then divide by the
              maximum of the logged values.
            * anything else: identity (shift 0, divide 1).

        Args:
            df: DataFrame the statistics are computed from.

        Returns:
            Dict mapping column name to
            ``{'shift': ..., 'divide': ..., 'logtransform': bool}``.
        """
        # NOTE(review): a zero/NaN 'divide' (e.g. constant column, or std of
        # a single row) is not guarded against and would yield inf/NaN.
        transform_dict = {}
        for f in self.input_cols + self.output_cols:
            if f in ['replica', 'cpu', 'heap']:
                shift = df[f].median()
                divide = 1
                logtransform = False
            elif f in ['expected_tps']:
                shift = df[f].mean()
                divide = df[f].std()
                logtransform = False
            elif f in ['num_request']:
                shift = 0
                divide = df[f].max()
                logtransform = False
            elif f in ['response_time']:
                # shift/divide is only after log
                logtransform = True
                shift = 0
                divide = np.max(np.log10(df[f]))
            else:
                shift = 0
                divide = 1
                logtransform = False
            transform_dict[f] = {'shift': shift, 'divide': divide,
                                 'logtransform': logtransform}
        return transform_dict

    def transform(self, df, cols):
        """Return a deep copy of ``df`` with the columns in ``cols`` normalized.

        Columns of ``df`` that are not listed in ``cols`` are left untouched.

        Args:
            df: DataFrame to transform (not modified).
            cols: Column names to normalize; each must have an entry in
                ``self.transform_dict``.

        Returns:
            The transformed copy of ``df``.
        """
        df_transform = df.copy(deep=True)
        for f in df.columns:
            if f in cols:
                shift = self.transform_dict[f]['shift']
                divide = self.transform_dict[f]['divide']
                logtransform = self.transform_dict[f]['logtransform']
                if logtransform:
                    df_transform[f] = np.log10(df_transform[f])
                df_transform[f] = (df_transform[f] - shift) / divide
        return df_transform

    def inv_transform(self, df, cols):
        """Return a deep copy of ``df`` with ``transform`` undone on ``cols``.

        The steps of ``transform`` are reversed: first un-scale and un-shift,
        then, for log-transformed columns, exponentiate with base 10.

        Args:
            df: DataFrame holding transformed values (not modified).
            cols: Column names to map back; each must have an entry in
                ``self.transform_dict``.

        Returns:
            The copy of ``df`` expressed in original units.
        """
        df_transform = df.copy(deep=True)
        for f in df.columns:
            if f in cols:
                shift = self.transform_dict[f]['shift']
                divide = self.transform_dict[f]['divide']
                logtransform = self.transform_dict[f]['logtransform']
                df_transform[f] = divide * df_transform[f] + shift
                if logtransform:
                    # Inverse of log10(x) is 10 ** x (not x ** 10).
                    df_transform[f] = np.power(10.0, df_transform[f])
        return df_transform

    def input_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalize the input columns of ``df``."""
        return self.transform(df, self.input_cols)

    def output_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalize the output columns of ``df``."""
        return self.transform(df, self.output_cols)

    def input_inv_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Map normalized input columns of ``df`` back to original units."""
        return self.inv_transform(df, self.input_cols)

    def output_inv_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Map normalized output columns of ``df`` back to original units."""
        return self.inv_transform(df, self.output_cols)
def data_replica_vs_cpu_usage():
    """Load the replica-versus-cpu_usage data set.

    Reads ``averaged_full_state_data.csv`` from the current working
    directory, keeps only the rows with ``cpu == 5`` and
    ``expected_tps == 88``, and restricts the frame to the input and
    output columns.

    Returns:
        Tuple ``(df, input_cols, output_cols, other_cols)`` where ``df``
        contains only the listed columns, ``input_cols`` is
        ``['replica']``, ``output_cols`` is ``['cpu_usage']`` and
        ``other_cols`` is an empty list.
    """
    df = pd.read_csv('averaged_full_state_data.csv')
    # DataFrame.query returns a new frame; the result has to be assigned
    # back, otherwise the filter is silently dropped.
    df = df.query('cpu == 5 and expected_tps == 88')
    input_cols = ['replica']
    output_cols = ['cpu_usage']
    other_cols = []
    df = df[input_cols + output_cols + other_cols]
    return df, input_cols, output_cols, other_cols