"""
labeler.py — Supervised learning target construction for crypto trading.

Target definition (binary):
    y = 1  if the trade hits the RR=1:2 target BEFORE the stop within
           LABEL_FORWARD_BARS
    y = 0  if the stop is hit first OR neither is hit within the window

Design decisions:
    - Stop and target computed from ATR at signal bar (no lookahead)
    - Realistic costs (fees + slippage) are added to the target distance,
      so a win must clear them on top of the nominal reward
    - Both long and short labeling supported (direction from rule engine)
    - Time-series integrity: labeling uses only forward prices from bar+1
    - NaN label produced when insufficient forward bars exist (dropped later)

Target horizon N = 24 bars (1H timeframe = 1 full trading day):
    - Short enough to avoid regime change within the trade
    - Long enough for 1:2 RR to fully play out
    - Empirically: >24 bars introduces too many confounding events
    - <12 bars under-samples legitimate continuation moves
"""

from typing import Optional

import numpy as np
import pandas as pd

from ml_config import (
    LABEL_FORWARD_BARS,
    STOP_MULT,
    TARGET_RR,
    ROUND_TRIP_COST,
)


def label_single_trade(
    df: pd.DataFrame,
    signal_idx: int,
    atr: float,
    direction: int,  # +1 = long, -1 = short
    forward_bars: int = LABEL_FORWARD_BARS,
) -> Optional[int]:
    """
    Label a single trade signal.

    Entry is the close of the signal bar. The stop sits ``atr * STOP_MULT``
    away from entry; the target sits ``stop_distance * TARGET_RR`` plus the
    round-trip cost (``entry * ROUND_TRIP_COST``) away on the other side.
    Bars ``signal_idx + 1`` .. ``signal_idx + forward_bars`` (clipped to the
    end of ``df``) are scanned in order; the first bar that touches either
    level decides the label.

    Args:
        df: Full OHLCV DataFrame (index = timestamp, sorted ascending).
            The columns "close", "high" and "low" are read.
        signal_idx: Integer position of signal bar in df
        atr: ATR value AT signal bar (must be pre-computed, no lookahead)
        direction: +1 long, -1 short
        forward_bars: Max bars to check

    Returns:
        1    = win (target hit first)
        0    = loss (stop hit first or timeout)
        None = no label can be produced: no forward bars are available,
               signal_idx is negative, direction is not +1/-1, or atr /
               entry price is non-finite (atr must also be positive)
    """
    n_bars = len(df)

    # A negative position would wrap around under iloc and make the
    # "forward" window start from the beginning of the data.
    if signal_idx < 0 or signal_idx + 1 >= n_bars:
        return None

    # Anything other than +1/-1 (including 0 and NaN) is not a tradeable
    # direction; refusing to label is safer than silently assuming short.
    if direction not in (1, -1):
        return None

    # A NaN or non-positive ATR gives a stop at (or on the wrong side of)
    # entry, which would produce a meaningless label.
    if not np.isfinite(atr) or atr <= 0:
        return None

    entry_price = float(df["close"].iloc[signal_idx])
    if not np.isfinite(entry_price):
        return None

    stop_distance = atr * STOP_MULT

    # Cost-adjusted thresholds: we need price to move further than naive RR
    cost_ticks = entry_price * ROUND_TRIP_COST
    target_distance = stop_distance * TARGET_RR + cost_ticks

    # Slice first, then convert, so only the forward window is copied.
    end_idx = min(signal_idx + 1 + forward_bars, n_bars)
    highs = df["high"].iloc[signal_idx + 1 : end_idx].to_numpy(dtype="float64")
    lows = df["low"].iloc[signal_idx + 1 : end_idx].to_numpy(dtype="float64")

    if highs.size == 0:
        return None

    if direction == 1:  # long
        stop_hit = lows <= entry_price - stop_distance
        target_hit = highs >= entry_price + target_distance
    else:  # short
        stop_hit = highs >= entry_price + stop_distance
        target_hit = lows <= entry_price - target_distance

    resolved = stop_hit | target_hit
    if not resolved.any():
        # Neither hit within window = loss (opportunity cost + fees)
        return 0

    # argmax on a boolean array gives the first bar that touched a level.
    first = int(np.argmax(resolved))

    # Pessimistic ordering: if one bar touches both stop and target, the
    # stop is assumed to have been hit first.
    return 0 if stop_hit[first] else 1


def label_dataframe(
    df: pd.DataFrame,
    signal_mask: pd.Series,
    atr_series: pd.Series,
    direction_series: pd.Series,
    forward_bars: int = LABEL_FORWARD_BARS,
    min_bars_remaining: int = LABEL_FORWARD_BARS,
) -> pd.Series:
    """
    Label all signal bars in a DataFrame.

    ``signal_mask``, ``atr_series`` and ``direction_series`` are read by
    integer position, so they must be row-aligned with ``df``.

    Args:
        df: Full OHLCV DataFrame
        signal_mask: Boolean series, True where a setup was flagged
        atr_series: ATR at each bar (aligned to df index)
        direction_series: +1/-1 for each signal bar
        forward_bars: Max forward window
        min_bars_remaining: Drop labels too close to end of data. If this
            is smaller than forward_bars, signals near the end are labeled
            on a window clipped to the available bars.

    Returns:
        Series of {1, 0, NaN} aligned to df.index. Bars that are not
        signals, are too close to the end of the data, have a missing ATR,
        a missing or zero direction, or that label_single_trade cannot
        label, stay NaN.
    """
    labels = pd.Series(np.nan, index=df.index, dtype="float64")
    n = len(df)

    signal_positions = np.where(signal_mask.values)[0]

    for pos in signal_positions:
        # Drop signals too close to end of data (insufficient forward bars)
        if pos + min_bars_remaining >= n:
            continue

        raw_atr = atr_series.iloc[pos]
        raw_direction = direction_series.iloc[pos]

        # Check for missing values BEFORE converting: int(NaN) raises
        # ValueError and would abort the whole labeling pass.
        if pd.isna(raw_atr) or pd.isna(raw_direction):
            continue

        direction = int(raw_direction)
        if direction == 0:
            continue

        label = label_single_trade(
            df, pos, float(raw_atr), direction, forward_bars
        )
        if label is not None:
            labels.iloc[pos] = float(label)

    return labels


def compute_label_stats(labels: pd.Series) -> dict:
    """
    Return win rate, class balance, and label counts for diagnostics.

    Args:
        labels: Series of {1, 0, NaN} labels; NaN entries are ignored.

    Returns:
        Dict with keys:
            total_labels: number of non-NaN labels
            wins: count of labels equal to 1
            losses: count of labels equal to 0
            win_rate: wins / total_labels rounded to 4 places
                (0.0 when there are no labels)
            class_imbalance_ratio: wins / losses rounded to 3 places
                (inf when there are no losses)
    """
    valid = labels.dropna()
    total = len(valid)
    wins = int((valid == 1).sum())
    losses = int((valid == 0).sum())

    win_rate = wins / total if total > 0 else 0.0
    class_imbalance = wins / losses if losses > 0 else float("inf")

    return {
        "total_labels": total,
        "wins": wins,
        "losses": losses,
        "win_rate": round(win_rate, 4),
        "class_imbalance_ratio": round(class_imbalance, 3),
    }