| |
| |
| |
| |
|
|
| import os |
| import random |
|
|
| import numpy as np |
| import pandas as pd |
| from sklearn.preprocessing import StandardScaler |
|
|
# UCR archive datasets that are excluded from the 109-dataset subset
# (presumably the variable-length / missing-value / later additions —
# confirm against the subset CSVs in ../data).
non_109_datasets = [
    "HandOutlines",
    "NonInvasiveFetalECGThorax1",
    "NonInvasiveFetalECGThorax2",
    "AllGestureWiimoteX",
    "AllGestureWiimoteY",
    "AllGestureWiimoteZ",
    "DodgerLoopDay",
    "DodgerLoopGame",
    "DodgerLoopWeekend",
    "Fungi",
    "GestureMidAirD1",
    "GestureMidAirD2",
    "GestureMidAirD3",
    "GesturePebbleZ1",
    "GesturePebbleZ2",
    "MelbournePedestrian",
    "PickupGestureWiimoteZ",
    "PLAID",
    "ShakeGestureWiimoteZ",
]
|
|
# The 128 univariate classification datasets of the UCR time-series archive.
# Order is significant and preserved: the original 85 "bakeoff" datasets
# first, followed by the 43 datasets added in the 2018 expansion.
classification_datasets = [
    "Adiac", "ArrowHead", "Beef", "BeetleFly", "BirdChicken",
    "Car", "CBF", "ChlorineConcentration", "CinCECGTorso", "Coffee",
    "Computers", "CricketX", "CricketY", "CricketZ", "DiatomSizeReduction",
    "DistalPhalanxOutlineAgeGroup", "DistalPhalanxOutlineCorrect",
    "DistalPhalanxTW", "Earthquakes", "ECG200", "ECG5000", "ECGFiveDays",
    "ElectricDevices", "FaceAll", "FaceFour", "FacesUCR", "FiftyWords",
    "Fish", "FordA", "FordB", "GunPoint", "Ham", "HandOutlines",
    "Haptics", "Herring", "InlineSkate", "InsectWingbeatSound",
    "ItalyPowerDemand", "LargeKitchenAppliances", "Lightning2",
    "Lightning7", "Mallat", "Meat", "MedicalImages",
    "MiddlePhalanxOutlineAgeGroup", "MiddlePhalanxOutlineCorrect",
    "MiddlePhalanxTW", "MoteStrain", "NonInvasiveFetalECGThorax1",
    "NonInvasiveFetalECGThorax2", "OliveOil", "OSULeaf",
    "PhalangesOutlinesCorrect", "Phoneme", "Plane",
    "ProximalPhalanxOutlineAgeGroup", "ProximalPhalanxOutlineCorrect",
    "ProximalPhalanxTW", "RefrigerationDevices", "ScreenType",
    "ShapeletSim", "ShapesAll", "SmallKitchenAppliances",
    "SonyAIBORobotSurface1", "SonyAIBORobotSurface2", "StarLightCurves",
    "Strawberry", "SwedishLeaf", "Symbols", "SyntheticControl",
    "ToeSegmentation1", "ToeSegmentation2", "Trace", "TwoLeadECG",
    "TwoPatterns", "UWaveGestureLibraryAll", "UWaveGestureLibraryX",
    "UWaveGestureLibraryY", "UWaveGestureLibraryZ", "Wafer", "Wine",
    "WordSynonyms", "Worms", "WormsTwoClass", "Yoga",
    # --- 2018 additions ---
    "ACSF1", "AllGestureWiimoteX", "AllGestureWiimoteY",
    "AllGestureWiimoteZ", "BME", "Chinatown", "Crop", "DodgerLoopDay",
    "DodgerLoopGame", "DodgerLoopWeekend", "EOGHorizontalSignal",
    "EOGVerticalSignal", "EthanolLevel", "FreezerRegularTrain",
    "FreezerSmallTrain", "Fungi", "GestureMidAirD1", "GestureMidAirD2",
    "GestureMidAirD3", "GesturePebbleZ1", "GesturePebbleZ2",
    "GunPointAgeSpan", "GunPointMaleVersusFemale", "GunPointOldVersusYoung",
    "HouseTwenty", "InsectEPGRegularTrain", "InsectEPGSmallTrain",
    "MelbournePedestrian", "MixedShapesRegularTrain",
    "MixedShapesSmallTrain", "PickupGestureWiimoteZ", "PigAirwayPressure",
    "PigArtPressure", "PigCVP", "PLAID", "PowerCons", "Rock",
    "SemgHandGenderCh2", "SemgHandMovementCh2", "SemgHandSubjectCh2",
    "ShakeGestureWiimoteZ", "SmoothSubspace", "UMD",
]
|
|
|
|
def get_classification_datasets_summary(dataset=None, subset="full"):
    """Load a summary table describing the UCR classification datasets.

    The summary CSV is looked up first under ``../data`` and, failing
    that, under ``<cwd>/data``.

    :param dataset: optional dataset name; when given, only the matching
        row(s) are returned (index reset)
    :param subset: one of ``"109"``, ``"bakeoff"``, ``"development"``,
        ``"holdout"``; any other value (e.g. ``"full"``) selects the
        complete table
    :return: a pandas DataFrame with whitespace-stripped column names
    """
    # Map each named subset to the CSV filename suffix it is stored under.
    suffix_by_subset = {
        "109": "_109",
        "bakeoff": "_bakeoff",
        "development": "_development",
    }
    if subset == "holdout":
        # Holdout = bakeoff datasets that are NOT in the development split.
        df_dev = _load_summary_csv("classification_datasets_development.csv")
        df = _load_summary_csv("classification_datasets_bakeoff.csv")
        df = df.loc[~df["Name"].isin(df_dev["Name"])].reset_index(drop=True)
    else:
        suffix = suffix_by_subset.get(subset, "")
        df = _load_summary_csv("classification_datasets" + suffix + ".csv")

    if dataset is None:
        return df
    return df.loc[df.Name == dataset].reset_index(drop=True)


def _load_summary_csv(filename):
    """Read a summary CSV from ``../data`` or ``<cwd>/data`` and strip
    whitespace from its column names."""
    path = os.path.join("..", "data", filename)
    if not os.path.exists(path):
        path = os.path.join(os.getcwd(), "data", filename)
    df = pd.read_csv(path)
    # Headers in the shipped CSVs carry stray spaces; normalise them so
    # callers can rely on e.g. df["Name"].
    df.columns = [c.strip() for c in df.columns]
    return df
|
|
|
|
def read_univariate_ucr(filename, normalise=True):
    """Read a univariate UCR-format data file.

    Each row is ``label, v1, v2, ...``; files whose name contains "csv"
    are comma-delimited, anything else (e.g. .tsv) is tab-delimited.

    :param filename: path to the data file
    :param normalise: z-normalise each series individually when True
    :return: tuple ``(X, Y)`` where ``X`` has shape
        ``(n_instances, series_len, 1)`` and ``Y`` holds the labels
    """
    delimiter = ',' if "csv" in filename else '\t'
    data = np.loadtxt(filename, delimiter=delimiter)
    Y = data[:, 0]
    X = data[:, 1:]

    # Replace missing values with low-amplitude noise so downstream
    # transforms never see NaN (matches the UCR preprocessing convention).
    nan_mask = np.isnan(X)
    if nan_mask.any():
        X[nan_mask] = np.array([random.random() / 1000
                                for _ in range(int(nan_mask.sum()))])

    if normalise:
        # Per-series z-normalisation; the scaler is only built when needed.
        scaler = StandardScaler()
        for i in range(len(X)):
            X[i] = scaler.fit_transform(X[i].reshape(-1, 1))[:, 0]

    X = X.reshape((X.shape[0], X.shape[1], 1))
    return X, Y
|
|
|
|
def fill_missing(x: np.ndarray,
                 max_len: int,
                 vary_len: str = "suffix-noise",
                 normalise: bool = True):
    """Fill the NaN tails of variable-length series in-place.

    Each row of ``x`` is a series whose valid values come first and whose
    missing tail is marked with NaN.

    :param x: 2D array of shape ``(n_instances, max_len)``
    :param max_len: target series length (``x.shape[1]``)
    :param vary_len: strategy — ``"zero"`` (zero-fill),
        ``"prefix-suffix-noise"`` (centre the series between noise pads),
        ``"uniform-scaling"`` (stretch the series to ``max_len``), or any
        other value for the default suffix-noise fill
    :param normalise: z-normalise each series after filling
    :return: the filled (and possibly normalised) array
    """
    if vary_len == "zero":
        # NOTE(review): unlike the other branches, this normalises per
        # time step (StandardScaler works column-wise on a 2D input), not
        # per series — preserved as-is; confirm this asymmetry is intended.
        if normalise:
            x = StandardScaler().fit_transform(x)
        x = np.nan_to_num(x)
    elif vary_len == 'prefix-suffix-noise':
        for i in range(len(x)):
            series = _leading_valid(x[i])
            seq_len = len(series)
            diff_len = int(0.5 * (max_len - seq_len))

            # Centre the series and pad BOTH ends with low-amplitude noise.
            # (The previous version never shifted the series, so the prefix
            # noise overwrote the first diff_len real values.)
            for j in range(diff_len):
                x[i, j] = random.random() / 1000
            for j in range(diff_len, diff_len + seq_len):
                x[i, j] = series[j - diff_len]
            for j in range(diff_len + seq_len, max_len):
                x[i, j] = random.random() / 1000

            if normalise:
                x[i] = StandardScaler().fit_transform(x[i].reshape(-1, 1))[:, 0]
    elif vary_len == 'uniform-scaling':
        for i in range(len(x)):
            series = _leading_valid(x[i])
            seq_len = len(series)

            # Stretch the series to max_len by repeating samples.
            for j in range(max_len):
                x[i, j] = series[int(j * seq_len / max_len)]
            if normalise:
                x[i] = StandardScaler().fit_transform(x[i].reshape(-1, 1))[:, 0]
    else:
        # Default "suffix-noise": replace every NaN with low-amplitude noise.
        for i in range(len(x)):
            for j in np.flatnonzero(np.isnan(x[i])):
                x[i, j] = random.random() / 1000

            if normalise:
                x[i] = StandardScaler().fit_transform(x[i].reshape(-1, 1))[:, 0]

    return x


def _leading_valid(row):
    """Return the values of *row* up to (not including) the first NaN."""
    series = []
    for value in row:
        if np.isnan(value):
            break
        series.append(value)
    return np.array(series)
|
|
|
|
def process_ts_data(X,
                    vary_len: str = "suffix-noise",
                    normalise: bool = False):
    """Convert a nested (sktime-style) DataFrame to a 3D numpy array.

    :param X: DataFrame of shape ``(n_instances, n_dims)`` whose cells
        each hold one univariate series (a pandas Series)
    :param vary_len: variable-length handling strategy, forwarded to
        ``fill_missing``
    :param normalise: per-series z-normalisation, forwarded to
        ``fill_missing``
    :return: float64 array of shape ``(n_instances, n_dims, max_len)``
    """
    num_instances, num_dim = X.shape
    columns = X.columns
    # Longest series across ALL dimensions (not just the first), so
    # multivariate data with unequal per-dimension lengths is handled.
    max_len = max(len(X[col][i]) for col in columns for i in range(num_instances))
    # Pre-fill with NaN so shorter series end in NaN, which fill_missing
    # interprets as the missing tail to pad.
    output = np.full((num_instances, num_dim, max_len), np.nan, dtype=np.float64)

    for i in range(num_dim):
        for j in range(num_instances):
            values = X[columns[i]][j].values
            output[j, i, :len(values)] = values
        output[:, i, :] = fill_missing(
            output[:, i, :],
            max_len,
            vary_len,
            normalise
        )

    return output
|
|