| |
| |
| |
| |
|
|
| import os |
| import random |
|
|
| import numpy as np |
| import pandas as pd |
| from sklearn.preprocessing import StandardScaler |
|
|
# UCR archive datasets that are excluded from the 109-dataset subset
# (presumably the variable-length / missing-value / later additions —
# confirm against the subset CSVs in ../data).
non_109_datasets = [
    "HandOutlines",
    "NonInvasiveFetalECGThorax1",
    "NonInvasiveFetalECGThorax2",
    "AllGestureWiimoteX",
    "AllGestureWiimoteY",
    "AllGestureWiimoteZ",
    "DodgerLoopDay",
    "DodgerLoopGame",
    "DodgerLoopWeekend",
    "Fungi",
    "GestureMidAirD1",
    "GestureMidAirD2",
    "GestureMidAirD3",
    "GesturePebbleZ1",
    "GesturePebbleZ2",
    "MelbournePedestrian",
    "PickupGestureWiimoteZ",
    "PLAID",
    "ShakeGestureWiimoteZ",
]
|
|
# The 128 univariate classification datasets of the UCR time-series archive.
# Order is significant and preserved: the original 85 "bakeoff" datasets
# first, followed by the 43 datasets added in the 2018 expansion.
classification_datasets = [
    "Adiac", "ArrowHead", "Beef", "BeetleFly", "BirdChicken",
    "Car", "CBF", "ChlorineConcentration", "CinCECGTorso", "Coffee",
    "Computers", "CricketX", "CricketY", "CricketZ", "DiatomSizeReduction",
    "DistalPhalanxOutlineAgeGroup", "DistalPhalanxOutlineCorrect",
    "DistalPhalanxTW", "Earthquakes", "ECG200", "ECG5000", "ECGFiveDays",
    "ElectricDevices", "FaceAll", "FaceFour", "FacesUCR", "FiftyWords",
    "Fish", "FordA", "FordB", "GunPoint", "Ham", "HandOutlines",
    "Haptics", "Herring", "InlineSkate", "InsectWingbeatSound",
    "ItalyPowerDemand", "LargeKitchenAppliances", "Lightning2",
    "Lightning7", "Mallat", "Meat", "MedicalImages",
    "MiddlePhalanxOutlineAgeGroup", "MiddlePhalanxOutlineCorrect",
    "MiddlePhalanxTW", "MoteStrain", "NonInvasiveFetalECGThorax1",
    "NonInvasiveFetalECGThorax2", "OliveOil", "OSULeaf",
    "PhalangesOutlinesCorrect", "Phoneme", "Plane",
    "ProximalPhalanxOutlineAgeGroup", "ProximalPhalanxOutlineCorrect",
    "ProximalPhalanxTW", "RefrigerationDevices", "ScreenType",
    "ShapeletSim", "ShapesAll", "SmallKitchenAppliances",
    "SonyAIBORobotSurface1", "SonyAIBORobotSurface2", "StarLightCurves",
    "Strawberry", "SwedishLeaf", "Symbols", "SyntheticControl",
    "ToeSegmentation1", "ToeSegmentation2", "Trace", "TwoLeadECG",
    "TwoPatterns", "UWaveGestureLibraryAll", "UWaveGestureLibraryX",
    "UWaveGestureLibraryY", "UWaveGestureLibraryZ", "Wafer", "Wine",
    "WordSynonyms", "Worms", "WormsTwoClass", "Yoga",
    # --- 2018 additions ---
    "ACSF1", "AllGestureWiimoteX", "AllGestureWiimoteY",
    "AllGestureWiimoteZ", "BME", "Chinatown", "Crop", "DodgerLoopDay",
    "DodgerLoopGame", "DodgerLoopWeekend", "EOGHorizontalSignal",
    "EOGVerticalSignal", "EthanolLevel", "FreezerRegularTrain",
    "FreezerSmallTrain", "Fungi", "GestureMidAirD1", "GestureMidAirD2",
    "GestureMidAirD3", "GesturePebbleZ1", "GesturePebbleZ2",
    "GunPointAgeSpan", "GunPointMaleVersusFemale", "GunPointOldVersusYoung",
    "HouseTwenty", "InsectEPGRegularTrain", "InsectEPGSmallTrain",
    "MelbournePedestrian", "MixedShapesRegularTrain",
    "MixedShapesSmallTrain", "PickupGestureWiimoteZ", "PigAirwayPressure",
    "PigArtPressure", "PigCVP", "PLAID", "PowerCons", "Rock",
    "SemgHandGenderCh2", "SemgHandMovementCh2", "SemgHandSubjectCh2",
    "ShakeGestureWiimoteZ", "SmoothSubspace", "UMD",
]
|
|
|
|
def get_classification_datasets_summary(dataset=None, subset="full"):
    """Load a summary table describing the UCR classification datasets.

    The summary CSV is looked up first under ``../data`` and, failing
    that, under ``<cwd>/data``.

    :param dataset: optional dataset name; when given, only the matching
        row(s) are returned (index reset)
    :param subset: one of ``"109"``, ``"bakeoff"``, ``"development"``,
        ``"holdout"``; any other value (e.g. ``"full"``) selects the
        complete table
    :return: a pandas DataFrame with whitespace-stripped column names
    """
    # Map each named subset to the CSV filename suffix it is stored under.
    suffix_by_subset = {
        "109": "_109",
        "bakeoff": "_bakeoff",
        "development": "_development",
    }
    if subset == "holdout":
        # Holdout = bakeoff datasets that are NOT in the development split.
        df_dev = _load_summary_csv("classification_datasets_development.csv")
        df = _load_summary_csv("classification_datasets_bakeoff.csv")
        df = df.loc[~df["Name"].isin(df_dev["Name"])].reset_index(drop=True)
    else:
        suffix = suffix_by_subset.get(subset, "")
        df = _load_summary_csv("classification_datasets" + suffix + ".csv")

    if dataset is None:
        return df
    return df.loc[df.Name == dataset].reset_index(drop=True)


def _load_summary_csv(filename):
    """Read a summary CSV from ``../data`` or ``<cwd>/data`` and strip
    whitespace from its column names."""
    path = os.path.join("..", "data", filename)
    if not os.path.exists(path):
        path = os.path.join(os.getcwd(), "data", filename)
    df = pd.read_csv(path)
    # Headers in the shipped CSVs carry stray spaces; normalise them so
    # callers can rely on e.g. df["Name"].
    df.columns = [c.strip() for c in df.columns]
    return df
|
|
|
|
def read_univariate_ucr(filename, normalise=True):
    """Read a univariate UCR-format data file.

    Each row is ``label, v1, v2, ...``; files whose name contains "csv"
    are comma-delimited, anything else (e.g. .tsv) is tab-delimited.

    :param filename: path to the data file
    :param normalise: z-normalise each series individually when True
    :return: tuple ``(X, Y)`` where ``X`` has shape
        ``(n_instances, series_len, 1)`` and ``Y`` holds the labels
    """
    delimiter = ',' if "csv" in filename else '\t'
    data = np.loadtxt(filename, delimiter=delimiter)
    Y = data[:, 0]
    X = data[:, 1:]

    # Replace missing values with low-amplitude noise so downstream
    # transforms never see NaN (matches the UCR preprocessing convention).
    nan_mask = np.isnan(X)
    if nan_mask.any():
        X[nan_mask] = np.array([random.random() / 1000
                                for _ in range(int(nan_mask.sum()))])

    if normalise:
        # Per-series z-normalisation; the scaler is only built when needed.
        scaler = StandardScaler()
        for i in range(len(X)):
            X[i] = scaler.fit_transform(X[i].reshape(-1, 1))[:, 0]

    X = X.reshape((X.shape[0], X.shape[1], 1))
    return X, Y
|
|
|
|
def fill_missing(x: np.ndarray,
                 max_len: int,
                 vary_len: str = "suffix-noise",
                 normalise: bool = True):
    """Fill the NaN tails of variable-length series in-place.

    Each row of ``x`` is a series whose valid values come first and whose
    missing tail is marked with NaN.

    :param x: 2D array of shape ``(n_instances, max_len)``
    :param max_len: target series length (``x.shape[1]``)
    :param vary_len: strategy — ``"zero"`` (zero-fill),
        ``"prefix-suffix-noise"`` (centre the series between noise pads),
        ``"uniform-scaling"`` (stretch the series to ``max_len``), or any
        other value for the default suffix-noise fill
    :param normalise: z-normalise each series after filling
    :return: the filled (and possibly normalised) array
    """
    if vary_len == "zero":
        # NOTE(review): unlike the other branches, this normalises per
        # time step (StandardScaler works column-wise on a 2D input), not
        # per series — preserved as-is; confirm this asymmetry is intended.
        if normalise:
            x = StandardScaler().fit_transform(x)
        x = np.nan_to_num(x)
    elif vary_len == 'prefix-suffix-noise':
        for i in range(len(x)):
            series = _leading_valid(x[i])
            seq_len = len(series)
            diff_len = int(0.5 * (max_len - seq_len))

            # Centre the series and pad BOTH ends with low-amplitude noise.
            # (The previous version never shifted the series, so the prefix
            # noise overwrote the first diff_len real values.)
            for j in range(diff_len):
                x[i, j] = random.random() / 1000
            for j in range(diff_len, diff_len + seq_len):
                x[i, j] = series[j - diff_len]
            for j in range(diff_len + seq_len, max_len):
                x[i, j] = random.random() / 1000

            if normalise:
                x[i] = StandardScaler().fit_transform(x[i].reshape(-1, 1))[:, 0]
    elif vary_len == 'uniform-scaling':
        for i in range(len(x)):
            series = _leading_valid(x[i])
            seq_len = len(series)

            # Stretch the series to max_len by repeating samples.
            for j in range(max_len):
                x[i, j] = series[int(j * seq_len / max_len)]
            if normalise:
                x[i] = StandardScaler().fit_transform(x[i].reshape(-1, 1))[:, 0]
    else:
        # Default "suffix-noise": replace every NaN with low-amplitude noise.
        for i in range(len(x)):
            for j in np.flatnonzero(np.isnan(x[i])):
                x[i, j] = random.random() / 1000

            if normalise:
                x[i] = StandardScaler().fit_transform(x[i].reshape(-1, 1))[:, 0]

    return x


def _leading_valid(row):
    """Return the values of *row* up to (not including) the first NaN."""
    series = []
    for value in row:
        if np.isnan(value):
            break
        series.append(value)
    return np.array(series)
|
|
|
|
def process_ts_data(X,
                    vary_len: str = "suffix-noise",
                    normalise: bool = False):
    """Convert a nested (sktime-style) DataFrame to a 3D numpy array.

    :param X: DataFrame of shape ``(n_instances, n_dims)`` whose cells
        each hold one univariate series (a pandas Series)
    :param vary_len: variable-length handling strategy, forwarded to
        ``fill_missing``
    :param normalise: per-series z-normalisation, forwarded to
        ``fill_missing``
    :return: float64 array of shape ``(n_instances, n_dims, max_len)``
    """
    num_instances, num_dim = X.shape
    columns = X.columns
    # Longest series across ALL dimensions (not just the first), so
    # multivariate data with unequal per-dimension lengths is handled.
    max_len = max(len(X[col][i]) for col in columns for i in range(num_instances))
    # Pre-fill with NaN so shorter series end in NaN, which fill_missing
    # interprets as the missing tail to pad.
    output = np.full((num_instances, num_dim, max_len), np.nan, dtype=np.float64)

    for i in range(num_dim):
        for j in range(num_instances):
            values = X[columns[i]][j].values
            output[j, i, :len(values)] = values
        output[:, i, :] = fill_missing(
            output[:, i, :],
            max_len,
            vary_len,
            normalise
        )

    return output
|
|