Spaces:

akshayboora
/

AFML

No application file

App Files Files Community

AFML / afml /features /returns.py

akshayboora

Upload 940 files

669d6a1 verified 26 days ago

raw

history blame contribute delete

6.74 kB

	from typing import Union

	import numpy as np
	import pandas as pd
	from numba import njit, prange


	def get_period_returns(close: pd.Series, **time_delta_kwargs) -> pd.Series:
	    """
	    Compute periodic returns over a fixed time span, robust to gaps in the index.

	    For every timestamp ``t`` the reference price is the last observation whose
	    timestamp is strictly earlier than ``t - delta``, where ``delta`` is built
	    from ``time_delta_kwargs``. Timestamps with no such earlier observation are
	    dropped from the result. Rows are matched by position, so repeated
	    timestamps in the index are supported.

	    :param close: (pd.Series) closing prices, indexed by datetime. The index is
	        assumed to be sorted in ascending order (required by `searchsorted`).
	    :param time_delta_kwargs: Keyword arguments forwarded to `pd.Timedelta`, e.g.:
	        - days: (int) Number of days
	        - hours: (int) Number of hours
	        - minutes: (int) Number of minutes
	        - seconds: (int) Number of seconds
	    :return: (pd.Series) Periodic returns (``close / previous close - 1``),
	        indexed by the timestamps that have a valid prior observation
	    """
	    # Default side="left": position of the first timestamp >= t - delta, so
	    # `position - 1` is the last observation strictly before t - delta.
	    prev_idx = close.index.searchsorted(close.index - pd.Timedelta(**time_delta_kwargs))

	    # Position 0 means nothing precedes t - delta; drop those entries
	    prev_idx = prev_idx[prev_idx > 0]

	    # With a sorted index the surviving entries belong to the trailing
	    # observations. Slice them by position: a label lookup (`.loc`) would
	    # expand repeated timestamps and break the element-wise division.
	    curr_close = close.iloc[close.shape[0] - prev_idx.shape[0] :]
	    prev_close = close.iloc[prev_idx - 1].to_numpy()

	    return curr_close / prev_close - 1


	@njit(parallel=True, cache=True)
	def rolling_autocorr_numba(data: np.ndarray, lookback: int) -> np.ndarray:
	    """
	    Rolling lag-1 autocorrelation of a 1D array, compiled with Numba.

	    For each position ``i >= lookback - 1`` the trailing window
	    ``data[i - lookback + 1 : i + 1]`` is taken and the correlation between
	    the window without its last element and the window without its first
	    element is stored at ``i``. The loop is written with `prange` and the
	    function is compiled with ``parallel=True``.

	    Args:
	        data: A 1D NumPy array of numerical data (e.g., returns).
	        lookback: Number of observations in each rolling window.

	    Returns:
	        A NumPy array with the same length as `data`. The first
	        `lookback - 1` entries are NaN because no full window exists yet.
	    """
	    n_obs = len(data)
	    out = np.full(n_obs, np.nan)
	    for end in prange(lookback - 1, n_obs):
	        start = end - lookback + 1
	        win = data[start : end + 1]
	        # Off-diagonal entry of the 2x2 matrix = correlation between the
	        # lagged slice and the leading slice
	        out[end] = np.corrcoef(win[:-1], win[1:])[0, 1]
	    return out


	def get_period_autocorr(
	    close: pd.Series, lookback: int = 100, **time_delta_kwargs
	) -> pd.Series:
	    """
	    Estimate the rolling lag-1 autocorrelation of periodic returns.

	    Periodic returns are computed with `get_period_returns` and then passed
	    to the Numba-compiled `rolling_autocorr_numba`.

	    :param close: (pd.Series) closing prices, indexed by datetime
	    :param lookback: (int) Number of return observations in each rolling
	        window (default is 100)
	    :param time_delta_kwargs: Time components for calculating period returns:
	        - days: (int) Number of days
	        - hours: (int) Number of hours
	        - minutes: (int) Number of minutes
	        - seconds: (int) Number of seconds
	    :return: (pd.Series) rolling autocorrelation values, indexed like the
	        periodic returns (not the full `close` index); the first
	        `lookback - 1` values are NaN
	    """
	    period_ret = get_period_returns(close, **time_delta_kwargs)
	    return pd.Series(
	        rolling_autocorr_numba(period_ret.to_numpy(), lookback),
	        index=period_ret.index,
	    )


	def get_lagged_returns(
	    prices: Union[pd.Series, pd.DataFrame],
	    lags: list,
	    nperiods: int = 3,
	    q: float = 0.0001,
	) -> pd.DataFrame:
	    """
	    Generates a DataFrame of multi-period returns and shifted copies of them.

	    For every lag the function computes the `lag`-period percentage change,
	    clips it to the `[q, 1 - q]` quantile range, and converts it to a
	    per-period geometric mean return. It then adds `nperiods` shifted copies
	    of each return series (`returns_X_lag_Y`), where copy `Y` is shifted by
	    `Y * X` rows.

	    Args:
	        prices: A pandas Series or DataFrame of close prices. If a Series, it's
	            treated as a single instrument. If a DataFrame, each column
	            represents a different instrument or asset.
	        lags: A list of integers, where each integer represents a lag period
	            for which returns should be calculated (e.g., `[1, 5, 20]`).
	        nperiods: The number of additional lagged versions to create for each
	            return series. For example, if `nperiods=3` and `lags=[1]`,
	            it will create `returns_1_lag_1`, `returns_1_lag_2`,
	            `returns_1_lag_3`. Defaults to 3.
	        q: Quantile cut-off used to winsorize the returns; values below the
	            `q` quantile or above the `1 - q` quantile are clipped. Use 0 to
	            disable clipping. Defaults to 0.0001.

	    Returns:
	        A pandas DataFrame containing the calculated returns followed by their
	        lagged versions. With a single instrument the columns are unprefixed
	        and `returns_1` is renamed to `returns`; with several instruments each
	        column is prefixed with `<column name>_`.

	    Raises:
	        ValueError: If `q` is not in the range `[0, 0.5)`.
	    """
	    if not 0 <= q < 0.5:
	        raise ValueError("q must satisfy 0 <= q < 0.5")

	    price_columns = (
	        [(prices.name or "price", prices)]
	        if isinstance(prices, pd.Series)
	        else [(col, prices[col]) for col in prices.columns]
	    )
	    single_instrument = len(price_columns) == 1

	    def _prefix(col_name) -> str:
	        # Column-name prefix; omitted when there is only one instrument
	        return "" if single_instrument else f"{col_name}_"

	    # Collect every column first and build the frame once, instead of
	    # inserting columns one at a time into a growing DataFrame.
	    columns = {}
	    for col_name, price_series in price_columns:
	        prefix = _prefix(col_name)
	        for lag in lags:
	            # Calculate 1-period geometric mean return of the lag period and
	            # winsorize extreme returns by clipping.
	            returns = price_series.pct_change(lag)
	            returns = returns.clip(
	                lower=returns.quantile(q), upper=returns.quantile(1 - q)
	            )
	            columns[f"{prefix}returns_{lag}"] = returns.add(1).pow(1 / lag).sub(1)

	    # Create additional lagged versions of the calculated returns
	    for t in range(1, nperiods + 1):
	        for col_name, _ in price_columns:
	            prefix = _prefix(col_name)
	            for lag in lags:
	                base = columns[f"{prefix}returns_{lag}"]
	                columns[f"{prefix}returns_{lag}_lag_{t}"] = base.shift(t * lag)

	    df = pd.DataFrame(columns)
	    df.rename(columns={"returns_1": "returns"}, inplace=True)
	    return df


	def get_return_dist_features(close: pd.Series, window: int = 10) -> pd.DataFrame:
	    """
	    Rolling distribution features of log returns.

	    Log returns are computed as the first difference of ``log(close)``; the
	    rolling statistics below are taken over `window` observations.

	    :param close: (pd.Series) closing prices
	    :param window: (int) rolling window size (default is 10). At least
	        ``min(3, window)`` observations are required before a value is
	        produced, so windows shorter than 3 are accepted. Non-integer
	        windows are passed to pandas unchanged with a minimum of 3.
	    :return: (pd.DataFrame) indexed like `close`, with columns
	        - returns_norm: log return minus its rolling mean, divided by its rolling std
	        - returns_skew: rolling skewness of log returns
	        - returns_kurt: rolling kurtosis of log returns
	    """
	    ret = np.log(close).diff()

	    # pandas rejects min_periods > window, so cap the minimum at the window size
	    min_periods = min(3, window) if isinstance(window, (int, np.integer)) else 3
	    rolling_ret = ret.rolling(window, min_periods=min_periods)

	    df = pd.DataFrame(index=close.index)
	    df["returns_norm"] = (ret - rolling_ret.mean()) / rolling_ret.std()
	    df["returns_skew"] = rolling_ret.skew()
	    df["returns_kurt"] = rolling_ret.kurt()
	    return df