Spaces:

Thang6822
/

SuperAI_Forecast

Running

App Files Files Community

SuperAI_Forecast / libs /chronos-forecasting /scripts /kernel-synth.py

Thang6822

Upload folder using huggingface_hub

7d63c70 verified 18 days ago

raw

history blame contribute delete

5.97 kB

	# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
	# SPDX-License-Identifier: Apache-2.0

	import argparse
	import functools
	from pathlib import Path
	from typing import Optional

	import numpy as np
	from gluonts.dataset.arrow import ArrowWriter
	from joblib import Parallel, delayed
	from sklearn.gaussian_process import GaussianProcessRegressor
	from sklearn.gaussian_process.kernels import (
	RBF,
	ConstantKernel,
	DotProduct,
	ExpSineSquared,
	Kernel,
	RationalQuadratic,
	WhiteKernel,
	)
	from tqdm.auto import tqdm

	# Number of points in every generated series; seasonal periods below are
	# given in time steps and divided by this value.
	LENGTH = 1024

	# Bank of base kernels that are sampled and combined into composite kernels.
	# The trailing tags (H, D, W, M, Q, Y and fractions thereof) are kept from the
	# original per-kernel comments; presumably they denote sampling frequencies.
	KERNEL_BANK = [
	    # Periodic kernels, one per seasonal period.
	    *(
	        ExpSineSquared(periodicity=steps / LENGTH)
	        for steps in (
	            24,  # H
	            48,  # 0.5H
	            96,  # 0.25H
	            24 * 7,  # H
	            48 * 7,  # 0.5H
	            96 * 7,  # 0.25H
	            7,  # D
	            14,  # 0.5D
	            30,  # D
	            60,  # 0.5D
	            365,  # D
	            365 * 2,  # 0.5D
	            4,  # W
	            26,  # W
	            52,  # W
	            4,  # M
	            6,  # M
	            12,  # M
	            4,  # Q
	            4 * 10,  # Q
	            10,  # Y
	        )
	    ),
	    # Dot-product kernels with different offsets.
	    *(DotProduct(sigma_0=offset) for offset in (0.0, 1.0, 10.0)),
	    # RBF kernels with different length scales.
	    *(RBF(length_scale=scale) for scale in (0.1, 1.0, 10.0)),
	    # Rational-quadratic kernels with different mixture parameters.
	    *(RationalQuadratic(alpha=mix) for mix in (0.1, 1.0, 10.0)),
	    # White-noise kernels with different noise levels.
	    *(WhiteKernel(noise_level=level) for level in (0.1, 1.0)),
	    # Constant kernel with its default parameters.
	    ConstantKernel(),
	]


	def random_binary_map(a: Kernel, b: Kernel):
	    """
	    Combine two kernels with a randomly chosen binary operator.

	    The operator is either addition or multiplication, drawn with equal
	    probability from NumPy's global random state.

	    Parameters
	    ----------
	    a
	        A GP kernel (left operand).
	    b
	        A GP kernel (right operand).

	    Returns
	    -------
	    The composite kernel `a + b` or `a * b`.
	    """

	    def _add(left, right):
	        return left + right

	    def _multiply(left, right):
	        return left * right

	    # A single draw from the global RNG decides which operator is applied.
	    combine = np.random.choice([_add, _multiply])
	    return combine(a, b)


	def sample_from_gp_prior(
	    kernel: Kernel, X: np.ndarray, random_seed: Optional[int] = None
	):
	    """
	    Draw a single sample from the GP prior defined by ``kernel``.

	    Sampling is delegated to ``GaussianProcessRegressor.sample_y`` on a
	    regressor that has not been fitted.

	    Parameters
	    ----------
	    kernel
	        The GP covariance kernel.
	    X
	        The input "time" points. A 1-D array is treated as a single
	        feature column; otherwise the array must be 2-D.
	    random_seed, optional
	        The random seed for sampling, by default None.

	    Returns
	    -------
	    A time series sampled from the GP prior, as returned by ``sample_y``
	    with ``n_samples=1``.
	    """
	    # Promote a flat vector of time points to a single-column matrix.
	    inputs = X[:, np.newaxis] if X.ndim == 1 else X
	    assert inputs.ndim == 2

	    regressor = GaussianProcessRegressor(kernel=kernel)
	    return regressor.sample_y(inputs, n_samples=1, random_state=random_seed)


	def sample_from_gp_prior_efficient(
	    kernel: Kernel,
	    X: np.ndarray,
	    random_seed: Optional[int] = None,
	    method: str = "eigh",
	):
	    """
	    Draw a sample from a GP prior with a configurable sampling method.

	    The covariance matrix is obtained by calling ``kernel`` on the inputs and
	    a zero-mean multivariate normal sample is drawn from a fresh NumPy
	    ``Generator``. The default sampling method used in
	    GaussianProcessRegressor is based on SVD, which is significantly slower
	    than alternatives such as `eigh` and `cholesky`.

	    Parameters
	    ----------
	    kernel
	        The GP covariance kernel.
	    X
	        The input "time" points. A 1-D array is treated as a single
	        feature column; otherwise the array must be 2-D.
	    random_seed, optional
	        The random seed for sampling, by default None.
	    method, optional
	        The sampling method for multivariate_normal, by default `eigh`.

	    Returns
	    -------
	    A time series sampled from the GP prior, with one value per row of
	    the inputs.
	    """
	    # Promote a flat vector of time points to a single-column matrix.
	    inputs = X[:, np.newaxis] if X.ndim == 1 else X
	    assert inputs.ndim == 2

	    num_points = inputs.shape[0]
	    covariance = kernel(inputs)

	    rng = np.random.default_rng(seed=random_seed)
	    return rng.multivariate_normal(
	        mean=np.zeros(num_points), cov=covariance, method=method
	    )


	def generate_time_series(max_kernels: int = 5):
	    """Generate a synthetic time series from KernelSynth.

	    Base kernels are drawn at random from ``KERNEL_BANK``, merged into one
	    composite kernel with random binary operators, and a series is sampled
	    from the resulting GP prior. If sampling fails with a linear-algebra
	    error, the error is printed and a new composite kernel is drawn.

	    Parameters
	    ----------
	    max_kernels, optional
	        The maximum number of base kernels to use for each time series, by default 5

	    Returns
	    -------
	    A time series generated by KernelSynth, as a dict with a ``"start"``
	    timestamp and a ``"target"`` array.
	    """
	    # The input grid does not depend on the sampled kernel, so build it once.
	    time_grid = np.linspace(0, 1, LENGTH)

	    while True:
	        # Randomly select up to max_kernels kernels (with replacement)
	        # from the KERNEL_BANK.
	        num_kernels = np.random.randint(1, max_kernels + 1)
	        base_kernels = np.random.choice(KERNEL_BANK, num_kernels, replace=True)

	        # Fold the selection into one kernel using random binary operators.
	        composite = functools.reduce(random_binary_map, base_kernels)

	        # Sample from the GP prior; on failure, retry with fresh kernels.
	        try:
	            series = sample_from_gp_prior(kernel=composite, X=time_grid)
	        except np.linalg.LinAlgError as err:
	            print("Error caught:", err)
	        else:
	            # The timestamp is arbitrary
	            return {
	                "start": np.datetime64("2000-01-01 00:00", "s"),
	                "target": series.squeeze(),
	            }


	if __name__ == "__main__":
	    # Command-line options: how many series to generate and the maximum
	    # number of base kernels per series.
	    cli = argparse.ArgumentParser()
	    cli.add_argument("-N", "--num-series", type=int, default=1_000_000)
	    cli.add_argument("-J", "--max-kernels", type=int, default=5)
	    options = cli.parse_args()

	    # The dataset is written next to this script.
	    output_file = Path(__file__).parent / "kernelsynth-data.arrow"

	    # Generate the series in parallel (n_jobs=-1), with a progress bar over
	    # task submission.
	    pool = Parallel(n_jobs=-1)
	    tasks = (
	        delayed(generate_time_series)(max_kernels=options.max_kernels)
	        for _ in tqdm(range(options.num_series))
	    )
	    series_list = pool(tasks)

	    # Write all generated series to a single lz4-compressed Arrow file.
	    ArrowWriter(compression="lz4").write_to_file(series_list, path=output_file)