File size: 17,795 Bytes

556d303

# Chang Wei Tan, Angus Dempster, Christoph Bergmeir, Geoffrey I Webb
#
# MultiRocket: Multiple pooling operators and transformations for fast and effective time series classification
# https://arxiv.org/abs/2102.00457

import os
import random

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Dataset names set apart from the rest of the archive; every entry also
# appears in ``classification_datasets``.
# NOTE(review): presumably these are the datasets left out of the "109"
# subset -- confirm against the code that consumes this list.
non_109_datasets = [
    "HandOutlines",
    "NonInvasiveFetalECGThorax1",
    "NonInvasiveFetalECGThorax2",
    "AllGestureWiimoteX",
    "AllGestureWiimoteY",
    "AllGestureWiimoteZ",
    "DodgerLoopDay",
    "DodgerLoopGame",
    "DodgerLoopWeekend",
    "Fungi",
    "GestureMidAirD1",
    "GestureMidAirD2",
    "GestureMidAirD3",
    "GesturePebbleZ1",
    "GesturePebbleZ2",
    "MelbournePedestrian",
    "PickupGestureWiimoteZ",
    "PLAID",
    "ShakeGestureWiimoteZ",
]

# Names of the 128 univariate classification datasets handled by this module.
# NOTE(review): the trailing comment on each entry looks like a row copied from
# the UCR archive summary table -- presumably train size, test size, number of
# classes, series length ("Vary" = unequal length) followed by three baseline
# error rates. Confirm against the archive before relying on these numbers.
classification_datasets = ["Adiac",  # 390,391,37,176,0.3887,0.3913 (3),0.3964
                           "ArrowHead",  # 36,175,3,251,0.2,0.2000 (0),0.2971
                           "Beef",  # 30,30,5,470,0.3333,0.3333 (0),0.3667
                           "BeetleFly",  # 20,20,2,512,0.25,0.3000 (7),0.3
                           "BirdChicken",  # 20,20,2,512,0.45,0.3000 (6),0.25
                           "Car",  # 60,60,4,577,0.2667,0.2333 (1),0.2667
                           "CBF",  # 30,900,3,128,0.1478,0.0044 (11),0.0033
                           "ChlorineConcentration",  # 467,3840,3,166,0.35,0.3500 (0),0.3516
                           "CinCECGTorso",  # 40,1380,4,1639,0.1029,0.0696 (1),0.3493
                           "Coffee",  # 28,28,2,286,0,0.0000 (0),0
                           "Computers",  # 250,250,2,720,0.424,0.3800 (12),0.3
                           "CricketX",  # 390,390,12,300,0.4231,0.2282 (10),0.2462
                           "CricketY",  # 390,390,12,300,0.4333,0.2410 (17),0.2564
                           "CricketZ",  # 390,390,12,300,0.4128,0.2538 (5),0.2462
                           "DiatomSizeReduction",  # 16,306,4,345,0.0654,0.0654 (0),0.0327
                           "DistalPhalanxOutlineAgeGroup",  # 400,139,3,80,0.3741,0.3741 (0),0.2302
                           "DistalPhalanxOutlineCorrect",  # 600,276,2,80,0.2826,0.2754 (1),0.2826
                           "DistalPhalanxTW",  # 400,139,6,80,0.3669,0.3669 (0),0.4101
                           "Earthquakes",  # 322,139,2,512,0.2878,0.2734 (6),0.2806
                           "ECG200",  # 100,100,2,96,0.12,0.1200 (0),0.23
                           "ECG5000",  # 500,4500,5,140,0.0751,0.0749 (1),0.0756
                           "ECGFiveDays",  # 23,861,2,136,0.2033,0.2033 (0),0.2323
                           "ElectricDevices",  # 8926,7711,7,96,0.4492,0.3806 (14),0.3988
                           "FaceAll",  # 560,1690,14,131,0.2864,0.1917 (3),0.1923
                           "FaceFour",  # 24,88,4,350,0.2159,0.1136 (2),0.1705
                           "FacesUCR",  # 200,2050,14,131,0.2307,0.0878 (12),0.0951
                           "FiftyWords",  # 450,455,50,270,0.3692,0.2418 (6),0.3099
                           "Fish",  # 175,175,7,463,0.2171,0.1543 (4),0.1771
                           "FordA",  # 3601,1320,2,500,0.3348,0.3091 (1),0.4455
                           "FordB",  # 3636,810,2,500,0.3938,0.3926 (1),0.3802
                           "GunPoint",  # 50,150,2,150,0.0867,0.0867 (0) ,0.0933
                           "Ham",  # 109,105,2,431,0.4,0.4000 (0),0.5333
                           "HandOutlines",  # 1000,370,2,2709,0.1378,0.1378 (0),0.1189
                           "Haptics",  # 155,308,5,1092,0.6299,0.5877 (2),0.6234
                           "Herring",  # 64,64,2,512,0.4844,0.4688 (5),0.4688
                           "InlineSkate",  # 100,550,7,1882,0.6582,0.6127 (14),0.6164
                           "InsectWingbeatSound",  # 220,1980,11,256,0.4384,0.4152 (1),0.6449
                           "ItalyPowerDemand",  # 67,1029,2,24,0.0447,0.0447 (0),0.0496
                           "LargeKitchenAppliances",  # 375,375,3,720,0.5067,0.2053 (94),0.2053
                           "Lightning2",  # 60,61,2,637,0.2459,0.1311 (6),0.1311
                           "Lightning7",  # 70,73,7,319,0.4247,0.2877 (5),0.274
                           "Mallat",  # 55,2345,8,1024,0.0857,0.0857 (0),0.0661
                           "Meat",  # 60,60,3,448,0.0667,0.0667 (0),0.0667
                           "MedicalImages",  # 381,760,10,99,0.3158,0.2526 (20),0.2632
                           "MiddlePhalanxOutlineAgeGroup",  # 400,154,3,80,0.4805,0.4805 (0),0.5
                           "MiddlePhalanxOutlineCorrect",  # 600,291,2,80,0.2337,0.2337 (0),0.3024
                           "MiddlePhalanxTW",  # 399,154,6,80,0.487,0.4935 (3),0.4935
                           "MoteStrain",  # 20,1252,2,84,0.1214,0.1342 (1),0.1653
                           "NonInvasiveFetalECGThorax1",  # 1800,1965,42,750,0.171,0.1893 (1),0.2097
                           "NonInvasiveFetalECGThorax2",  # 1800,1965,42,750,0.1201,0.1290 (1),0.1354
                           "OliveOil",  # 30,30,4,570,0.1333,0.1333 (0),0.1667
                           "OSULeaf",  # 200,242,6,427,0.4793,0.3884 (7),0.4091
                           "PhalangesOutlinesCorrect",  # 1800,858,2,80,0.2389,0.2389 (0),0.2716
                           "Phoneme",  # 214,1896,39,1024,0.8908,0.7727 (14),0.7716
                           "Plane",  # 105,105,7,144,0.0381,0.0000 (5),0
                           "ProximalPhalanxOutlineAgeGroup",  # 400,205,3,80,0.2146,0.2146 (0),0.1951
                           "ProximalPhalanxOutlineCorrect",  # 600,291,2,80,0.1924,0.2096 (1),0.2165
                           "ProximalPhalanxTW",  # 400,205,6,80,0.2927,0.2439 (2),0.2439
                           "RefrigerationDevices",  # 375,375,3,720,0.6053,0.5600 (8),0.536
                           "ScreenType",  # 375,375,3,720,0.64,0.5893 (17),0.6027
                           "ShapeletSim",  # 20,180,2,500,0.4611,0.3000 (3),0.35
                           "ShapesAll",  # 600,600,60,512,0.2483,0.1980 (4),0.2317
                           "SmallKitchenAppliances",  # 375,375,3,720,0.6587,0.3280 (15),0.3573
                           "SonyAIBORobotSurface1",  # 20,601,2,70,0.3045,0.3045 (0),0.2745
                           "SonyAIBORobotSurface2",  # 27,953,2,65,0.1406,0.1406 (0),0.1689
                           "StarLightCurves",  # 1000,8236,3,1024,0.1512,0.0947 (16),0.0934
                           "Strawberry",  # 613,370,2,235,0.0541,0.0541 (0),0.0595
                           "SwedishLeaf",  # 500,625,15,128,0.2112,0.1536 (2),0.208
                           "Symbols",  # 25,995,6,398,0.1005,0.0623 (8),0.0503
                           "SyntheticControl",  # 300,300,6,60,0.12,0.0167 (6),0.0067
                           "ToeSegmentation1",  # 40,228,2,277,0.3202,0.2500 (8),0.2281
                           "ToeSegmentation2",  # 36,130,2,343,0.1923,0.0923 (5),0.1615
                           "Trace",  # 100,100,4,275,0.24,0.0100 (3),0
                           "TwoLeadECG",  # 23,1139,2,82,0.2529,0.1317 (4),0.0957
                           "TwoPatterns",  # 1000,4000,4,128,0.0932,0.0015 (4),0
                           "UWaveGestureLibraryAll",  # 896,3582,8,945,0.0519,0.0343 (4),0.1083
                           "UWaveGestureLibraryX",  # 896,3582,8,315,0.2607,0.2267 (4),0.2725
                           "UWaveGestureLibraryY",  # 896,3582,8,315,0.338,0.3009 (4),0.366
                           "UWaveGestureLibraryZ",  # 896,3582,8,315,0.3504,0.3222 (6),0.3417
                           "Wafer",  # 1000,6164,2,152,0.0045,0.0045 (1),0.0201
                           "Wine",  # 57,54,2,234,0.3889,0.3889 (0),0.4259
                           "WordSynonyms",  # 267,638,25,270,0.3824,0.2618 (9),0.3511
                           "Worms",  # 181,77,5,900,0.5455,0.4675 (9),0.4156
                           "WormsTwoClass",  # 181,77,2,900,0.3896,0.4156 (7),0.3766
                           "Yoga",  # 300,3000,2,426,0.1697,0.1560 (7),0.1637
                           "ACSF1",  # 100,100,10,1460,0.46,0.3800 (4),0.36
                           "AllGestureWiimoteX",  # 300,700,10,Vary,0.4843, 0.2829 (14),0.2843
                           "AllGestureWiimoteY",  # 300,700,10,Vary,0.4314, 0.2700 (9),0.2714
                           "AllGestureWiimoteZ",  # 300,700,10,Vary,0.5457,0.3486 (11),0.3571
                           "BME",  # 30,150,3,128,0.1667,0.0200 (4),0.1
                           "Chinatown",  # 20,345,2,24,0.0464,0.0464 (0),0.0435
                           "Crop",  # 7200,16800,24,46,0.2883,0.2883 (0),0.3348
                           "DodgerLoopDay",  # 78,80,7,288,0.45, 0.4125 (1),0.5
                           "DodgerLoopGame",  # 20,138,2,288,0.1159, 0.0725 (1),0.1232
                           "DodgerLoopWeekend",  # 20,138,2,288,0.0145, 0.0217 (1),0.0507
                           "EOGHorizontalSignal",  # 362,362,12,1250,0.5829, 0.5249 (1),0.4972
                           "EOGVerticalSignal",  # 362,362,12,1250,0.558, 0.5249 (2),0.5525
                           "EthanolLevel",  # 504,500,4,1751,0.726,0.7180 (1),0.724
                           "FreezerRegularTrain",  # 150,2850,2,301,0.1951,0.0930 (1),0.1011
                           "FreezerSmallTrain",  # 28,2850,2,301,0.3302,0.3302 (0),0.2467
                           "Fungi",  # 18,186,18,201,0.1774,0.1774 (0),0.1613
                           "GestureMidAirD1",  # 208,130,26,Vary,0.4231, 0.3615 (5),0.4308
                           "GestureMidAirD2",  # 208,130,26,Vary,0.5077, 0.4000 (6),0.3923
                           "GestureMidAirD3",  # 208,130,26,Vary,0.6538, 0.6231 (1),0.6769
                           "GesturePebbleZ1",  # 132,172,6,Vary,0.2674,0.1744 (2),0.2093
                           "GesturePebbleZ2",  # 146,158,6,Vary,0.3291,0.2215 (6),0.3291
                           "GunPointAgeSpan",  # 135,316,2,150,0.1013,0.0348 (3),0.0823
                           "GunPointMaleVersusFemale",  # 135,316,2,150,0.0253,0.0253 (0),0.0032
                           "GunPointOldVersusYoung",  # 135,316,2,150,0.0476,0.0349 (4),0.1619
                           "HouseTwenty",  # 40,119,2,2000,0.3361, 0.0588 (33),0.0756
                           "InsectEPGRegularTrain",  # 62,249,3,601,0.3213,0.1727 (11),0.1285
                           "InsectEPGSmallTrain",  # 17,249,3,601,0.3373,0.3052 (1),0.2651
                           "MelbournePedestrian",  # 1200,2450,10,24,0.1518,0.1518 (0),0.2094
                           "MixedShapesRegularTrain",  # 500,2425,5,1024,0.1027, 0.0911 (4),0.1584
                           "MixedShapesSmallTrain",  # 100,2425,5,1024,0.1645, 0.1674 (7),0.2202
                           "PickupGestureWiimoteZ",  # 50,50,10,Vary,0.44,0.3400 (17),0.34
                           "PigAirwayPressure",  # 104,208,52,2000,0.9423,0.9038 (1),0.8942
                           "PigArtPressure",  # 104,208,52,2000,0.875,0.8029 (1),0.7548
                           "PigCVP",  # 104,208,52,2000,0.9183,0.8413 (11),0.8462
                           "PLAID",  # 537,537,11,Vary,0.4786,0.1862 (3),0.1601
                           "PowerCons",  # 180,180,2,144,0.0667,0.0778 (3),0.1222
                           "Rock",  # 20,50,4,2844,0.16, 0.1600 (0),0.4
                           "SemgHandGenderCh2",  # 300,600,2,1500,0.2383,0.1550 (1),0.1983
                           "SemgHandMovementCh2",  # 450,450,6,1500,0.6311,0.3622 (1),0.4156
                           "SemgHandSubjectCh2",  # 450,450,5,1500,0.5956,0.2000 (3),0.2733
                           "ShakeGestureWiimoteZ",  # 50,50,10,Vary,0.4,0.1600 (6),0.14
                           "SmoothSubspace",  # 150,150,3,15,0.0933,0.0533 (1),0.1733
                           "UMD",  # 36,144,3,150,0.2361,0.0278 (6),0.0069
                           ]


def _read_summary_csv(filename):
    """Load one dataset-summary CSV and strip whitespace from its column names.

    The file is looked up in ``../data`` first and, when it is not found
    there, in the ``data`` directory below the current working directory.
    """
    path = "../data/" + filename
    if not os.path.exists(path):
        path = os.getcwd() + "/data/" + filename
    df = pd.read_csv(path)
    df.columns = [x.strip() for x in df.columns]
    return df


def get_classification_datasets_summary(dataset=None, subset="full"):
    """Return the summary table of a dataset collection, or one of its rows.

    :param dataset: name of a single dataset; when given, only the rows whose
        ``Name`` column equals it are returned. ``None`` returns the whole
        table.
    :param subset: which summary file to read:
        ``"109"`` -> ``classification_datasets_109.csv``,
        ``"bakeoff"`` -> ``classification_datasets_bakeoff.csv``,
        ``"development"`` -> ``classification_datasets_development.csv``,
        ``"holdout"`` -> the bakeoff rows whose ``Name`` is not listed in the
        development file,
        anything else -> ``classification_datasets.csv``.
    :return: a ``pandas.DataFrame`` with whitespace-stripped column names.
    """
    if subset == "holdout":
        # Both tables go through the same header clean-up *before* "Name" is
        # accessed, so padded headers behave the same as in the other subsets.
        df_dev = _read_summary_csv("classification_datasets_development.csv")
        df = _read_summary_csv("classification_datasets_bakeoff.csv")
        df = df.loc[~df["Name"].isin(df_dev["Name"])].reset_index(drop=True)
    elif subset in ("109", "bakeoff", "development"):
        df = _read_summary_csv("classification_datasets_" + subset + ".csv")
    else:
        df = _read_summary_csv("classification_datasets.csv")

    if dataset is None:
        return df
    return df.loc[df.Name == dataset].reset_index(drop=True)


def read_univariate_ucr(filename, normalise=True):
    """Load a univariate dataset stored one series per row, label first.

    :param filename: path of the text file. It is parsed as comma-separated
        when ``"csv"`` occurs anywhere in the name and as tab-separated
        otherwise.
    :param normalise: when true, every series is standardised on its own
        (zero mean / unit variance via ``StandardScaler``).
    :return: ``(X, Y)`` where ``X`` has shape ``(n_series, series_length, 1)``
        and ``Y`` holds the first column of the file.

    Missing values (NaN) are replaced, before normalisation, by small random
    values drawn from ``random.random() / 1000``.
    """
    delimiter = ',' if "csv" in filename else '\t'
    # ndmin=2 keeps a file that holds a single series two-dimensional; without
    # it loadtxt returns a 1-D array and the column slicing below fails.
    data = np.loadtxt(filename, delimiter=delimiter, ndmin=2)
    Y = data[:, 0]
    X = data[:, 1:]

    # Only build the scaler when it is actually going to be used.
    scaler = StandardScaler() if normalise else None
    for row in X:  # each row is a view, so the writes below land in X
        # Visit NaNs left to right so the sequence of random draws (and hence
        # the result under a fixed seed) matches an element-by-element scan.
        for j in np.flatnonzero(np.isnan(row)):
            row[j] = random.random() / 1000
        if normalise:
            row[:] = scaler.fit_transform(row.reshape(-1, 1))[:, 0]
    X = X.reshape((X.shape[0], X.shape[1], 1))
    return X, Y


def _noise_values(count):
    """Return ``count`` draws of ``random.random() / 1000`` as a list."""
    return [random.random() / 1000 for _ in range(count)]


def _valid_prefix(row):
    """Return a copy of the values in ``row`` that precede its first NaN."""
    nan_positions = np.flatnonzero(np.isnan(row))
    end = nan_positions[0] if nan_positions.size else len(row)
    return row[:end].copy()


def _standardise_series(row):
    """Standardise a single 1-D series with ``StandardScaler``."""
    return StandardScaler().fit_transform(row.reshape(-1, 1))[:, 0]


def fill_missing(x: np.ndarray,
                 max_len: int,
                 vary_len: str = "suffix-noise",
                 normalise: bool = True):
    """Replace NaNs in a 2-D array of series (one series per row).

    :param x: array whose rows are series; NaN marks missing values. Except in
        ``"zero"`` mode the array is modified in place.
    :param max_len: number of leading positions of each row that are written
        in the ``'prefix-suffix-noise'`` and ``'uniform-scaling'`` modes.
    :param vary_len: how NaNs are handled:

        * ``"zero"`` -- NaNs become 0 (after optional normalisation); a new
          array is returned.
        * ``'prefix-suffix-noise'`` -- the values before the first NaN are
          centred in the row, with small random noise before and after them.
        * ``'uniform-scaling'`` -- the values before the first NaN are
          stretched to ``max_len`` by repeating samples.
        * anything else (e.g. ``"suffix-noise"``) -- each NaN is replaced by
          small random noise where it stands.
    :param normalise: when true, standardise with ``StandardScaler``.
    :return: the filled array.

    Noise values are ``random.random() / 1000``.
    """
    if vary_len == "zero":
        if normalise:
            # NOTE(review): this standardises column-wise (across rows), while
            # every other mode standardises each row on its own -- confirm
            # that the difference is intended.
            x = StandardScaler().fit_transform(x)
        return np.nan_to_num(x)

    for row in x:  # rows are views, so assignments below update x in place
        if vary_len == 'prefix-suffix-noise':
            series = _valid_prefix(row)
            seq_len = len(series)
            diff_len = max(max_len - seq_len, 0) // 2
            suffix_start = diff_len + seq_len

            # Prefix noise, then the *whole* series shifted right by diff_len,
            # then suffix noise. (Indexing the series with ``j - seq_len``
            # would leave it unshifted and let the prefix noise overwrite its
            # first diff_len observations.)
            row[:diff_len] = _noise_values(diff_len)
            row[diff_len:suffix_start] = series
            row[suffix_start:max_len] = _noise_values(max(max_len - suffix_start, 0))
        elif vary_len == 'uniform-scaling':
            series = _valid_prefix(row)
            # Position j takes sample floor(j * seq_len / max_len).
            positions = np.arange(max_len) * len(series) // max_len
            row[:max_len] = series[positions]
        else:
            # Visit NaNs left to right so the order of random draws is that
            # of an element-by-element scan.
            for j in np.flatnonzero(np.isnan(row)):
                row[j] = random.random() / 1000

        if normalise:
            row[:] = _standardise_series(row)

    return x


def process_ts_data(X,
                    vary_len: str = "suffix-noise",
                    normalise: bool = False):
    """
    Convert a dataframe of time series into a 3-D numpy array.

    :param X: dataframe with one row per instance and one column per
        dimension; every cell holds one series (a ``pandas.Series`` or any
        other 1-D sequence of numbers).
    :param vary_len: strategy handed to ``fill_missing`` for missing values
        and for series shorter than the longest one.
    :param normalise: handed to ``fill_missing``; whether to standardise.
    :return: float64 array of shape ``(num_instances, num_dim, max_len)``
        where ``max_len`` is the length of the longest series in ``X``.
    """
    num_instances, num_dim = X.shape
    if num_instances == 0 or num_dim == 0:
        # Nothing to convert; avoids taking the max of an empty sequence.
        return np.zeros((num_instances, num_dim, 0), dtype=np.float64)

    # Positional access (iat) so a dataframe whose index is not 0..n-1
    # (e.g. after a split or shuffle) is still read row by row in order.
    cells = [[np.asarray(X.iat[j, i], dtype=np.float64).ravel()
              for i in range(num_dim)]
             for j in range(num_instances)]
    max_len = max(len(series) for row in cells for series in row)

    # Start from NaN so that a series shorter than max_len is padded with
    # missing values, which fill_missing then treats per ``vary_len``.
    output = np.full((num_instances, num_dim, max_len), np.nan, dtype=np.float64)
    for j, row in enumerate(cells):
        for i, series in enumerate(row):
            output[j, i, :len(series)] = series

    for i in range(num_dim):
        output[:, i, :] = fill_missing(
            output[:, i, :],
            max_len,
            vary_len,
            normalise
        )

    return output