# Chang Wei Tan, Angus Dempster, Christoph Bergmeir, Geoffrey I Webb
#
# MultiRocket: Multiple pooling operators and transformations for fast and effective time series classification
# https://arxiv.org/abs/2102.00457
import os
import random

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# NOTE(review): presumably the datasets left out of the "109" benchmark
# subset -- inferred from the name only; confirm against the experiment code.
non_109_datasets = [
    "HandOutlines",
    "NonInvasiveFetalECGThorax1",
    "NonInvasiveFetalECGThorax2",
    "AllGestureWiimoteX",
    "AllGestureWiimoteY",
    "AllGestureWiimoteZ",
    "DodgerLoopDay",
    "DodgerLoopGame",
    "DodgerLoopWeekend",
    "Fungi",
    "GestureMidAirD1",
    "GestureMidAirD2",
    "GestureMidAirD3",
    "GesturePebbleZ1",
    "GesturePebbleZ2",
    "MelbournePedestrian",
    "PickupGestureWiimoteZ",
    "PLAID",
    "ShakeGestureWiimoteZ",
]

# NOTE(review): the trailing comment on each entry looks like the UCR archive
# summary row (train size, test size, number of classes, series length, then
# three baseline error rates) -- TODO confirm against the archive table.
classification_datasets = [
    "Adiac",  # 390,391,37,176,0.3887,0.3913 (3),0.3964
    "ArrowHead",  # 36,175,3,251,0.2,0.2000 (0),0.2971
    "Beef",  # 30,30,5,470,0.3333,0.3333 (0),0.3667
    "BeetleFly",  # 20,20,2,512,0.25,0.3000 (7),0.3
    "BirdChicken",  # 20,20,2,512,0.45,0.3000 (6),0.25
    "Car",  # 60,60,4,577,0.2667,0.2333 (1),0.2667
    "CBF",  # 30,900,3,128,0.1478,0.0044 (11),0.0033
    "ChlorineConcentration",  # 467,3840,3,166,0.35,0.3500 (0),0.3516
    "CinCECGTorso",  # 40,1380,4,1639,0.1029,0.0696 (1),0.3493
    "Coffee",  # 28,28,2,286,0,0.0000 (0),0
    "Computers",  # 250,250,2,720,0.424,0.3800 (12),0.3
    "CricketX",  # 390,390,12,300,0.4231,0.2282 (10),0.2462
    "CricketY",  # 390,390,12,300,0.4333,0.2410 (17),0.2564
    "CricketZ",  # 390,390,12,300,0.4128,0.2538 (5),0.2462
    "DiatomSizeReduction",  # 16,306,4,345,0.0654,0.0654 (0),0.0327
    "DistalPhalanxOutlineAgeGroup",  # 400,139,3,80,0.3741,0.3741 (0),0.2302
    "DistalPhalanxOutlineCorrect",  # 600,276,2,80,0.2826,0.2754 (1),0.2826
    "DistalPhalanxTW",  # 400,139,6,80,0.3669,0.3669 (0),0.4101
    "Earthquakes",  # 322,139,2,512,0.2878,0.2734 (6),0.2806
    "ECG200",  # 100,100,2,96,0.12,0.1200 (0),0.23
    "ECG5000",  # 500,4500,5,140,0.0751,0.0749 (1),0.0756
    "ECGFiveDays",  # 23,861,2,136,0.2033,0.2033 (0),0.2323
    "ElectricDevices",  # 8926,7711,7,96,0.4492,0.3806 (14),0.3988
    "FaceAll",  # 560,1690,14,131,0.2864,0.1917 (3),0.1923
    "FaceFour",  # 24,88,4,350,0.2159,0.1136 (2),0.1705
    "FacesUCR",  # 200,2050,14,131,0.2307,0.0878 (12),0.0951
    "FiftyWords",  # 450,455,50,270,0.3692,0.2418 (6),0.3099
    "Fish",  # 175,175,7,463,0.2171,0.1543 (4),0.1771
    "FordA",  # 3601,1320,2,500,0.3348,0.3091 (1),0.4455
    "FordB",  # 3636,810,2,500,0.3938,0.3926 (1),0.3802
    "GunPoint",  # 50,150,2,150,0.0867,0.0867 (0) ,0.0933
    "Ham",  # 109,105,2,431,0.4,0.4000 (0),0.5333
    "HandOutlines",  # 1000,370,2,2709,0.1378,0.1378 (0),0.1189
    "Haptics",  # 155,308,5,1092,0.6299,0.5877 (2),0.6234
    "Herring",  # 64,64,2,512,0.4844,0.4688 (5),0.4688
    "InlineSkate",  # 100,550,7,1882,0.6582,0.6127 (14),0.6164
    "InsectWingbeatSound",  # 220,1980,11,256,0.4384,0.4152 (1),0.6449
    "ItalyPowerDemand",  # 67,1029,2,24,0.0447,0.0447 (0),0.0496
    "LargeKitchenAppliances",  # 375,375,3,720,0.5067,0.2053 (94),0.2053
    "Lightning2",  # 60,61,2,637,0.2459,0.1311 (6),0.1311
    "Lightning7",  # 70,73,7,319,0.4247,0.2877 (5),0.274
    "Mallat",  # 55,2345,8,1024,0.0857,0.0857 (0),0.0661
    "Meat",  # 60,60,3,448,0.0667,0.0667 (0),0.0667
    "MedicalImages",  # 381,760,10,99,0.3158,0.2526 (20),0.2632
    "MiddlePhalanxOutlineAgeGroup",  # 400,154,3,80,0.4805,0.4805 (0),0.5
    "MiddlePhalanxOutlineCorrect",  # 600,291,2,80,0.2337,0.2337 (0),0.3024
    "MiddlePhalanxTW",  # 399,154,6,80,0.487,0.4935 (3),0.4935
    "MoteStrain",  # 20,1252,2,84,0.1214,0.1342 (1),0.1653
    "NonInvasiveFetalECGThorax1",  # 1800,1965,42,750,0.171,0.1893 (1),0.2097
    "NonInvasiveFetalECGThorax2",  # 1800,1965,42,750,0.1201,0.1290 (1),0.1354
    "OliveOil",  # 30,30,4,570,0.1333,0.1333 (0),0.1667
    "OSULeaf",  # 200,242,6,427,0.4793,0.3884 (7),0.4091
    "PhalangesOutlinesCorrect",  # 1800,858,2,80,0.2389,0.2389 (0),0.2716
    "Phoneme",  # 214,1896,39,1024,0.8908,0.7727 (14),0.7716
    "Plane",  # 105,105,7,144,0.0381,0.0000 (5),0
    "ProximalPhalanxOutlineAgeGroup",  # 400,205,3,80,0.2146,0.2146 (0),0.1951
    "ProximalPhalanxOutlineCorrect",  # 600,291,2,80,0.1924,0.2096 (1),0.2165
    "ProximalPhalanxTW",  # 400,205,6,80,0.2927,0.2439 (2),0.2439
    "RefrigerationDevices",  # 375,375,3,720,0.6053,0.5600 (8),0.536
    "ScreenType",  # 375,375,3,720,0.64,0.5893 (17),0.6027
    "ShapeletSim",  # 20,180,2,500,0.4611,0.3000 (3),0.35
    "ShapesAll",  # 600,600,60,512,0.2483,0.1980 (4),0.2317
    "SmallKitchenAppliances",  # 375,375,3,720,0.6587,0.3280 (15),0.3573
    "SonyAIBORobotSurface1",  # 20,601,2,70,0.3045,0.3045 (0),0.2745
    "SonyAIBORobotSurface2",  # 27,953,2,65,0.1406,0.1406 (0),0.1689
    "StarLightCurves",  # 1000,8236,3,1024,0.1512,0.0947 (16),0.0934
    "Strawberry",  # 613,370,2,235,0.0541,0.0541 (0),0.0595
    "SwedishLeaf",  # 500,625,15,128,0.2112,0.1536 (2),0.208
    "Symbols",  # 25,995,6,398,0.1005,0.0623 (8),0.0503
    "SyntheticControl",  # 300,300,6,60,0.12,0.0167 (6),0.0067
    "ToeSegmentation1",  # 40,228,2,277,0.3202,0.2500 (8),0.2281
    "ToeSegmentation2",  # 36,130,2,343,0.1923,0.0923 (5),0.1615
    "Trace",  # 100,100,4,275,0.24,0.0100 (3),0
    "TwoLeadECG",  # 23,1139,2,82,0.2529,0.1317 (4),0.0957
    "TwoPatterns",  # 1000,4000,4,128,0.0932,0.0015 (4),0
    "UWaveGestureLibraryAll",  # 896,3582,8,945,0.0519,0.0343 (4),0.1083
    "UWaveGestureLibraryX",  # 896,3582,8,315,0.2607,0.2267 (4),0.2725
    "UWaveGestureLibraryY",  # 896,3582,8,315,0.338,0.3009 (4),0.366
    "UWaveGestureLibraryZ",  # 896,3582,8,315,0.3504,0.3222 (6),0.3417
    "Wafer",  # 1000,6164,2,152,0.0045,0.0045 (1),0.0201
    "Wine",  # 57,54,2,234,0.3889,0.3889 (0),0.4259
    "WordSynonyms",  # 267,638,25,270,0.3824,0.2618 (9),0.3511
    "Worms",  # 181,77,5,900,0.5455,0.4675 (9),0.4156
    "WormsTwoClass",  # 181,77,2,900,0.3896,0.4156 (7),0.3766
    "Yoga",  # 300,3000,2,426,0.1697,0.1560 (7),0.1637
    "ACSF1",  # 100,100,10,1460,0.46,0.3800 (4),0.36
    "AllGestureWiimoteX",  # 300,700,10,Vary,0.4843, 0.2829 (14),0.2843
    "AllGestureWiimoteY",  # 300,700,10,Vary,0.4314, 0.2700 (9),0.2714
    "AllGestureWiimoteZ",  # 300,700,10,Vary,0.5457,0.3486 (11),0.3571
    "BME",  # 30,150,3,128,0.1667,0.0200 (4),0.1
    "Chinatown",  # 20,345,2,24,0.0464,0.0464 (0),0.0435
    "Crop",  # 7200,16800,24,46,0.2883,0.2883 (0),0.3348
    "DodgerLoopDay",  # 78,80,7,288,0.45, 0.4125 (1),0.5
    "DodgerLoopGame",  # 20,138,2,288,0.1159, 0.0725 (1),0.1232
    "DodgerLoopWeekend",  # 20,138,2,288,0.0145, 0.0217 (1),0.0507
    "EOGHorizontalSignal",  # 362,362,12,1250,0.5829, 0.5249 (1),0.4972
    "EOGVerticalSignal",  # 362,362,12,1250,0.558, 0.5249 (2),0.5525
    "EthanolLevel",  # 504,500,4,1751,0.726,0.7180 (1),0.724
    "FreezerRegularTrain",  # 150,2850,2,301,0.1951,0.0930 (1),0.1011
    "FreezerSmallTrain",  # 28,2850,2,301,0.3302,0.3302 (0),0.2467
    "Fungi",  # 18,186,18,201,0.1774,0.1774 (0),0.1613
    "GestureMidAirD1",  # 208,130,26,Vary,0.4231, 0.3615 (5),0.4308
    "GestureMidAirD2",  # 208,130,26,Vary,0.5077, 0.4000 (6),0.3923
    "GestureMidAirD3",  # 208,130,26,Vary,0.6538, 0.6231 (1),0.6769
    "GesturePebbleZ1",  # 132,172,6,Vary,0.2674,0.1744 (2),0.2093
    "GesturePebbleZ2",  # 146,158,6,Vary,0.3291,0.2215 (6),0.3291
    "GunPointAgeSpan",  # 135,316,2,150,0.1013,0.0348 (3),0.0823
    "GunPointMaleVersusFemale",  # 135,316,2,150,0.0253,0.0253 (0),0.0032
    "GunPointOldVersusYoung",  # 135,316,2,150,0.0476,0.0349 (4),0.1619
    "HouseTwenty",  # 40,119,2,2000,0.3361, 0.0588 (33),0.0756
    "InsectEPGRegularTrain",  # 62,249,3,601,0.3213,0.1727 (11),0.1285
    "InsectEPGSmallTrain",  # 17,249,3,601,0.3373,0.3052 (1),0.2651
    "MelbournePedestrian",  # 1200,2450,10,24,0.1518,0.1518 (0),0.2094
    "MixedShapesRegularTrain",  # 500,2425,5,1024,0.1027, 0.0911 (4),0.1584
    "MixedShapesSmallTrain",  # 100,2425,5,1024,0.1645, 0.1674 (7),0.2202
    "PickupGestureWiimoteZ",  # 50,50,10,Vary,0.44,0.3400 (17),0.34
    "PigAirwayPressure",  # 104,208,52,2000,0.9423,0.9038 (1),0.8942
    "PigArtPressure",  # 104,208,52,2000,0.875,0.8029 (1),0.7548
    "PigCVP",  # 104,208,52,2000,0.9183,0.8413 (11),0.8462
    "PLAID",  # 537,537,11,Vary,0.4786,0.1862 (3),0.1601
    "PowerCons",  # 180,180,2,144,0.0667,0.0778 (3),0.1222
    "Rock",  # 20,50,4,2844,0.16, 0.1600 (0),0.4
    "SemgHandGenderCh2",  # 300,600,2,1500,0.2383,0.1550 (1),0.1983
    "SemgHandMovementCh2",  # 450,450,6,1500,0.6311,0.3622 (1),0.4156
    "SemgHandSubjectCh2",  # 450,450,5,1500,0.5956,0.2000 (3),0.2733
    "ShakeGestureWiimoteZ",  # 50,50,10,Vary,0.4,0.1600 (6),0.14
    "SmoothSubspace",  # 150,150,3,15,0.0933,0.0533 (1),0.1733
    "UMD",  # 36,144,3,150,0.2361,0.0278 (6),0.0069
]

# Summary CSV read for each named subset; any other subset value (including
# the default "full") reads the complete table.
_SUMMARY_FILES = {
    "109": "classification_datasets_109.csv",
    "bakeoff": "classification_datasets_bakeoff.csv",
    "development": "classification_datasets_development.csv",
}
_DEFAULT_SUMMARY_FILE = "classification_datasets.csv"


def _read_summary_csv(file_name):
    """Load one summary CSV and strip whitespace from its column names.

    ``../data/<file_name>`` is used when it exists; otherwise the file is read
    from ``<current working directory>/data/<file_name>``.
    """
    path = os.path.join("..", "data", file_name)
    if not os.path.exists(path):
        path = os.path.join(os.getcwd(), "data", file_name)
    df = pd.read_csv(path)
    df.columns = [name.strip() for name in df.columns]
    return df


def get_classification_datasets_summary(dataset=None, subset="full"):
    """Return the dataset summary table, optionally restricted to one dataset.

    :param dataset: value to match in the ``Name`` column; ``None`` returns
        the whole table for the chosen subset.
    :param subset: ``"109"``, ``"bakeoff"`` or ``"development"`` read the
        matching CSV; ``"holdout"`` returns the bakeoff rows whose ``Name`` is
        not in the development table; anything else reads the full table.
    :return: a ``pandas.DataFrame`` with whitespace-stripped column names.
        When ``dataset`` is given the index is reset; the frame is empty if no
        row matches.
    """
    if subset == "holdout":
        # Both frames get their headers stripped *before* "Name" is used, so a
        # padded header such as " Name" no longer raises KeyError here.
        df_dev = _read_summary_csv(_SUMMARY_FILES["development"])
        df = _read_summary_csv(_SUMMARY_FILES["bakeoff"])
        df = df.loc[~df["Name"].isin(df_dev["Name"])].reset_index(drop=True)
    else:
        df = _read_summary_csv(_SUMMARY_FILES.get(subset, _DEFAULT_SUMMARY_FILE))

    if dataset is None:
        return df
    return df.loc[df["Name"] == dataset].reset_index(drop=True)


def _noise(count):
    """Return ``count`` low-amplitude random values in ``[0, 0.001)``."""
    return [random.random() / 1000 for _ in range(count)]


def _replace_nans_with_noise(series):
    """Overwrite every NaN of a 1-D array, in place, with low-amplitude noise."""
    for j in np.flatnonzero(np.isnan(series)):
        series[j] = random.random() / 1000


def _standardise_in_place(series):
    """Rescale a 1-D array in place with a ``StandardScaler`` fitted on it."""
    series[:] = StandardScaler().fit_transform(series.reshape(-1, 1))[:, 0]


def _leading_values(row):
    """Return a copy of the values of ``row`` that precede its first NaN."""
    nan_positions = np.flatnonzero(np.isnan(row))
    end = nan_positions[0] if nan_positions.size else len(row)
    return row[:end].copy()


def read_univariate_ucr(filename, normalise=True):
    """Read a UCR-style text file: one series per row, label in column 0.

    :param filename: path of the file. It is parsed as comma-separated when
        the path contains ``"csv"`` and as tab-separated otherwise.
    :param normalise: when true, each series is standardised on its own.
    :return: ``(X, Y)`` where ``X`` has shape ``(n_series, series_length, 1)``
        and ``Y`` holds the first column of the file.
    """
    delimiter = "," if "csv" in str(filename) else "\t"
    # ndmin=2 keeps a one-row file two-dimensional so the column slicing
    # below still works.
    data = np.loadtxt(filename, delimiter=delimiter, ndmin=2)
    Y = data[:, 0]
    X = data[:, 1:]

    for row in X:
        # Missing values become small noise; the scaling below then applies
        # to the filled series.
        _replace_nans_with_noise(row)
        if normalise:
            _standardise_in_place(row)

    X = X.reshape((X.shape[0], X.shape[1], 1))
    return X, Y


def fill_missing(x: np.ndarray,
                 max_len: int,
                 vary_len: str = "suffix-noise",
                 normalise: bool = True):
    """Replace the NaN entries of a 2-D array of series.

    :param x: array with one series per row, NaN marking missing values. Every
        mode except ``"zero"`` modifies ``x`` in place.
    :param max_len: target series length; expected to equal ``x.shape[1]``.
    :param vary_len: how to fill:

        * ``"zero"`` -- NaNs become 0.
        * ``"prefix-suffix-noise"`` -- the values before the first NaN are
          centred in the row, with low-amplitude noise on both sides.
        * ``"uniform-scaling"`` -- the values before the first NaN are
          stretched to ``max_len`` by repeating samples.
        * anything else (default ``"suffix-noise"``) -- each NaN is replaced
          by low-amplitude noise where it stands.
    :param normalise: when true, standardise the data. In ``"zero"`` mode this
        happens before filling; in the other modes each row is standardised on
        its own after filling.
    :return: the filled array (``x`` itself, except in ``"zero"`` mode, which
        returns a new array).
    :raises IndexError: in ``"uniform-scaling"`` mode when a row starts with NaN.
    """
    if vary_len == "zero":
        if normalise:
            # NOTE(review): StandardScaler scales column-wise, so this scales
            # each time step across rows, unlike the per-row scaling of the
            # other modes -- confirm this is intended.
            x = StandardScaler().fit_transform(x)
        x = np.nan_to_num(x)
    elif vary_len == 'prefix-suffix-noise':
        for i in range(len(x)):
            series = _leading_values(x[i])
            seq_len = len(series)
            prefix_len = int(0.5 * max(max_len - seq_len, 0))
            series_end = prefix_len + seq_len

            # Layout is [noise | series | noise]. The series is written at the
            # prefix offset so no observed value is overwritten by the prefix.
            x[i, :prefix_len] = _noise(prefix_len)
            x[i, prefix_len:series_end] = series
            x[i, series_end:max_len] = _noise(max_len - series_end)
            if normalise:
                _standardise_in_place(x[i])
    elif vary_len == 'uniform-scaling':
        positions = np.arange(max_len)
        for i in range(len(x)):
            series = _leading_values(x[i])
            seq_len = len(series)
            # Output position j takes the sample at int(j * seq_len / max_len).
            source_idx = (positions * seq_len / max_len).astype(int)
            x[i, :max_len] = series[source_idx]
            if normalise:
                _standardise_in_place(x[i])
    else:
        for i in range(len(x)):
            _replace_nans_with_noise(x[i])
            if normalise:
                _standardise_in_place(x[i])

    return x


def process_ts_data(X,
                    vary_len: str = "suffix-noise",
                    normalise: bool = False):
    """Convert a DataFrame of per-cell series into a dense 3-D array.

    :param X: DataFrame with one row per instance and one column per
        dimension; each cell holds one series (anything with ``len()`` that
        ``np.asarray`` can turn into floats).
    :param vary_len: fill strategy passed to ``fill_missing``.
    :param normalise: passed to ``fill_missing``.
    :return: float64 array of shape ``(num_instances, num_dim, max_len)``,
        where ``max_len`` is the longest series over all cells. Shorter series
        are padded with NaN and then filled by ``fill_missing``.
    """
    num_instances, num_dim = X.shape
    columns = X.columns
    max_len = max(
        (len(X[col][j]) for col in columns for j in range(num_instances)),
        default=0,
    )

    # Start from NaN so the unwritten tail of a short series is seen as
    # missing by fill_missing.
    output = np.full((num_instances, num_dim, max_len), np.nan, dtype=np.float64)
    for i, col in enumerate(columns):
        for j in range(num_instances):
            values = np.asarray(X[col][j], dtype=np.float64)
            output[j, i, :len(values)] = values
        output[:, i, :] = fill_missing(
            output[:, i, :],
            max_len,
            vary_len,
            normalise
        )
    return output