Spaces:

GoshawkVortexAI
/

Goshawk_Hedge_Pro

Running

App Files Files Community

Goshawk_Hedge_Pro / labeler.py

GoshawkVortexAI

Create labeler.py

47584e0 verified 14 days ago

raw

history blame contribute delete

5.02 kB

	"""
	labeler.py — Supervised learning target construction for crypto trading.

	Target definition (binary):
	y = 1 if the trade hits RR=1:2 target BEFORE stop within LABEL_FORWARD_BARS
	y = 0 if stop is hit first OR neither hits within the window

	Design decisions:
	- Stop and target computed from ATR at signal bar (no lookahead)
	- Realistic costs (fees + slippage) added to the required target distance, so a win must cover them
	- Both long and short labeling supported (direction from rule engine)
	- Time-series integrity: labeling uses only forward prices from bar+1
	- NaN label produced when insufficient forward bars exist (dropped later)

	Target horizon N = 24 bars (1H timeframe = 1 full trading day):
	- Short enough to avoid regime change within the trade
	- Long enough for 1:2 RR to fully play out
	- Empirically: >24 bars introduces too many confounding events
	- <12 bars under-samples legitimate continuation moves
	"""

	import numpy as np
	import pandas as pd
	from typing import Optional

	from ml_config import (
	LABEL_FORWARD_BARS,
	STOP_MULT,
	TARGET_RR,
	ROUND_TRIP_COST,
	)


	def label_single_trade(
	    df: pd.DataFrame,
	    signal_idx: int,
	    atr: float,
	    direction: int,  # +1 = long, -1 = short
	    forward_bars: int = LABEL_FORWARD_BARS,
	) -> Optional[int]:
	    """
	    Label a single trade signal as a win, a loss, or unlabelable.

	    The trade is entered at the close of the signal bar. The stop sits
	    ``atr * STOP_MULT`` away from entry; the target sits
	    ``stop_distance * TARGET_RR + entry_price * ROUND_TRIP_COST`` away, i.e.
	    the cost term is added on top of the plain risk/reward distance. Only
	    bars strictly after the signal bar are inspected.

	    Args:
	        df: OHLCV DataFrame; the "close", "high" and "low" columns are read.
	            Rows are addressed by integer position.
	        signal_idx: Integer position of the signal bar in df.
	        atr: ATR value for the signal bar.
	        direction: +1 long, -1 short.
	        forward_bars: Maximum number of bars after the signal bar to scan.

	    Returns:
	        1 = win (target reached before the stop)
	        0 = loss (stop reached first, stop and target reached on the same
	            bar, or the full forward window elapsed with neither reached)
	        None = no label can be assigned: no forward bars, invalid inputs
	            (negative position, direction not +/-1, non-finite entry price,
	            non-finite or non-positive ATR), or the forward window was cut
	            short by the end of the data before either level was reached.
	    """
	    n_rows = len(df)

	    # A negative position would make the forward slice start near row 0,
	    # which is not "forward" of anything; treat it as unlabelable.
	    if signal_idx < 0 or signal_idx + 1 >= n_rows:
	        return None

	    # Anything other than +1/-1 has no defined stop/target side.
	    if direction not in (1, -1):
	        return None

	    entry_price = float(df["close"].iloc[signal_idx])

	    # NaN/inf thresholds never compare true and a non-positive stop distance
	    # triggers immediately; both would yield a meaningless 0 label.
	    if not (np.isfinite(entry_price) and np.isfinite(atr) and atr > 0):
	        return None

	    stop_distance = atr * STOP_MULT

	    # Cost-adjusted threshold: price has to travel further than the naive
	    # risk/reward distance before the trade counts as a win.
	    cost_offset = entry_price * ROUND_TRIP_COST
	    target_distance = stop_distance * TARGET_RR + cost_offset

	    start_idx = signal_idx + 1
	    end_idx = min(start_idx + forward_bars, n_rows)
	    if end_idx <= start_idx:
	        return None

	    highs = df["high"].iloc[start_idx:end_idx].to_numpy(dtype=float)
	    lows = df["low"].iloc[start_idx:end_idx].to_numpy(dtype=float)

	    if direction == 1:
	        stop_hit = lows <= entry_price - stop_distance
	        target_hit = highs >= entry_price + target_distance
	    else:
	        stop_hit = highs >= entry_price + stop_distance
	        target_hit = lows <= entry_price - target_distance

	    resolved = stop_hit | target_hit
	    if resolved.any():
	        # argmax on a boolean array gives the first True, i.e. the first bar
	        # that touched either level. Pessimistic ordering: if that bar
	        # touched both, the stop is assumed to have been hit first.
	        first_bar = int(np.argmax(resolved))
	        return 0 if stop_hit[first_bar] else 1

	    # Neither level was reached. If the data ended before the full window
	    # could be observed the outcome is unknown, not a loss.
	    if end_idx - start_idx < forward_bars:
	        return None

	    # Full window elapsed with neither hit = loss (opportunity cost + fees)
	    return 0


	def label_dataframe(
	    df: pd.DataFrame,
	    signal_mask: pd.Series,
	    atr_series: pd.Series,
	    direction_series: pd.Series,
	    forward_bars: int = LABEL_FORWARD_BARS,
	    min_bars_remaining: int = LABEL_FORWARD_BARS,
	) -> pd.Series:
	    """
	    Label all signal bars in a DataFrame.

	    Each position where signal_mask is truthy is passed to
	    label_single_trade together with the ATR and direction found at the
	    same position. All series are read by integer position, so they are
	    expected to be positionally aligned with df.

	    Args:
	        df: Full OHLCV DataFrame
	        signal_mask: Boolean series, True where a setup was flagged
	        atr_series: ATR at each bar (aligned to df index)
	        direction_series: +1/-1 for each signal bar
	        forward_bars: Max forward window handed to label_single_trade
	        min_bars_remaining: Signals at position pos are skipped when
	            pos + min_bars_remaining >= len(df)

	    Returns:
	        float64 Series of {1, 0, NaN} aligned to df.index. NaN marks bars
	        that are not signals, signals too close to the end of the data,
	        signals with a missing ATR or a missing/zero direction, and signals
	        for which label_single_trade returned None.
	    """
	    n = len(df)
	    # Fill a plain NumPy buffer and wrap it once at the end; cheaper than
	    # writing into a Series element by element.
	    label_values = np.full(n, np.nan, dtype="float64")

	    signal_positions = np.where(signal_mask.values)[0]

	    for pos in signal_positions:
	        # Positions come back in ascending order, so once one signal is too
	        # close to the end of the data every later one is as well.
	        if pos + min_bars_remaining >= n:
	            break

	        raw_atr = atr_series.iloc[pos]
	        raw_direction = direction_series.iloc[pos]

	        # Check for missing values BEFORE converting: int(NaN) raises
	        # ValueError and float(pd.NA) raises TypeError.
	        if pd.isna(raw_atr) or pd.isna(raw_direction):
	            continue

	        direction = int(raw_direction)
	        if direction == 0:
	            continue

	        label = label_single_trade(
	            df, int(pos), float(raw_atr), direction, forward_bars
	        )
	        if label is not None:
	            label_values[pos] = float(label)

	    return pd.Series(label_values, index=df.index, dtype="float64")


	def compute_label_stats(labels: pd.Series) -> dict:
	    """
	    Summarise a label series for diagnostics.

	    NaN entries are ignored. Returns a dict with the number of non-NaN
	    labels, the count of labels equal to 1 and equal to 0, the win rate
	    (wins / total, 0.0 when there are no labels, rounded to 4 places) and
	    the wins-to-losses ratio (infinity when there are no losses, rounded
	    to 3 places).
	    """
	    observed = labels[labels.notna()]
	    n_total = observed.shape[0]
	    n_wins = int(observed.eq(1).sum())
	    n_losses = int(observed.eq(0).sum())

	    # Guard both divisions explicitly instead of relying on exceptions.
	    if n_total > 0:
	        hit_rate = n_wins / n_total
	    else:
	        hit_rate = 0.0

	    if n_losses > 0:
	        wins_per_loss = n_wins / n_losses
	    else:
	        wins_per_loss = float("inf")

	    stats = {}
	    stats["total_labels"] = n_total
	    stats["wins"] = n_wins
	    stats["losses"] = n_losses
	    stats["win_rate"] = round(hit_rate, 4)
	    stats["class_imbalance_ratio"] = round(wins_per_loss, 3)
	    return stats