| | """ |
| | labeler.py — Supervised learning target construction for crypto trading. |
| | |
| | Target definition (binary): |
| | y = 1 if the trade hits RR=1:2 target BEFORE stop within LABEL_FORWARD_BARS |
| | y = 0 if stop is hit first OR neither hits within the window |
| | |
| | Design decisions: |
| | - Stop and target computed from ATR at signal bar (no lookahead) |
| | - Realistic costs (fees + slippage) added to the required target distance, so a win must clear the RR target net of costs |
| | - Both long and short labeling supported (direction from rule engine) |
| | - Time-series integrity: labeling uses only forward prices from bar+1 |
| | - NaN label produced when insufficient forward bars exist (dropped later) |
| | |
| | Target horizon N = 24 bars (1H timeframe = 1 full trading day): |
| | - Short enough to avoid regime change within the trade |
| | - Long enough for 1:2 RR to fully play out |
| | - Empirically: >24 bars introduces too many confounding events |
| | - <12 bars under-samples legitimate continuation moves |
| | """ |
| |
|
| | import numpy as np |
| | import pandas as pd |
| | from typing import Optional |
| |
|
| | from ml_config import ( |
| | LABEL_FORWARD_BARS, |
| | STOP_MULT, |
| | TARGET_RR, |
| | ROUND_TRIP_COST, |
| | ) |
| |
|
| |
|
def label_single_trade(
    df: pd.DataFrame,
    signal_idx: int,
    atr: float,
    direction: int,
    forward_bars: int = LABEL_FORWARD_BARS,
) -> Optional[int]:
    """
    Label a single trade signal.

    The trade is entered at the close of the signal bar. The stop sits
    ``atr * STOP_MULT`` away from entry; the target sits
    ``stop_distance * TARGET_RR`` away plus a cost buffer of
    ``entry_price * ROUND_TRIP_COST``, so a win has to clear the
    reward/risk target after costs. Only bars strictly after the signal
    bar are inspected.

    If a single bar touches both the stop and the target, the stop is
    assumed to have been hit first (conservative: counted as a loss).

    Args:
        df: Full OHLCV DataFrame (index = timestamp, sorted ascending).
            Must provide "close", "high" and "low" columns.
        signal_idx: Integer position of signal bar in df
        atr: ATR value AT signal bar (must be pre-computed, no lookahead)
        direction: +1 long, -1 short
        forward_bars: Max bars to check

    Returns:
        1 = win (target hit first)
        0 = loss (stop hit first, or timeout after a full forward window)
        None = no label can be assigned: the signal position is out of
            range, no forward bars exist, the direction is not +1/-1, the
            ATR or entry price is not a finite positive/finite number, or
            the forward window was cut short by the end of the data
            before either level was reached.
    """
    n_bars = len(df)

    # A negative position would wrap around through iloc and pair the
    # entry with unrelated bars, so treat it like any out-of-range signal.
    if signal_idx < 0 or signal_idx + 1 >= n_bars:
        return None

    # Anything other than +1/-1 has no defined stop/target geometry.
    if direction not in (1, -1):
        return None

    # A NaN/inf/non-positive ATR gives degenerate levels; emitting a
    # "loss" for it would inject a spurious negative label.
    if not np.isfinite(atr) or atr <= 0:
        return None

    entry_price = float(df["close"].iloc[signal_idx])
    if not np.isfinite(entry_price):
        return None

    stop_distance = atr * STOP_MULT
    # Costs widen the required move: the target must be cleared net of
    # the round-trip cost expressed in price units.
    cost_buffer = entry_price * ROUND_TRIP_COST
    target_distance = stop_distance * TARGET_RR + cost_buffer

    first_forward = signal_idx + 1
    window_end = min(first_forward + forward_bars, n_bars)
    highs = df["high"].iloc[first_forward:window_end].to_numpy(dtype="float64")
    lows = df["low"].iloc[first_forward:window_end].to_numpy(dtype="float64")

    if highs.size == 0:
        return None

    if direction == 1:
        stop_hit = lows <= entry_price - stop_distance
        target_hit = highs >= entry_price + target_distance
    else:
        stop_hit = highs >= entry_price + stop_distance
        target_hit = lows <= entry_price - target_distance

    resolved = stop_hit | target_hit
    if resolved.any():
        # argmax returns the first True, i.e. the earliest resolving bar.
        first_resolved = int(np.argmax(resolved))
        # Stop takes precedence when both levels are touched in one bar.
        return 0 if stop_hit[first_resolved] else 1

    # Neither level reached. That is a genuine timeout only if the whole
    # forward window was available; otherwise the outcome is unknown.
    if highs.size < forward_bars:
        return None
    return 0
| |
|
| |
|
def label_dataframe(
    df: pd.DataFrame,
    signal_mask: pd.Series,
    atr_series: pd.Series,
    direction_series: pd.Series,
    forward_bars: int = LABEL_FORWARD_BARS,
    min_bars_remaining: int = LABEL_FORWARD_BARS,
) -> pd.Series:
    """
    Label all signal bars in a DataFrame.

    Every position flagged in ``signal_mask`` is passed to
    ``label_single_trade``. A position is left as NaN when it lies too
    close to the end of the data, when its ATR or direction is missing,
    when its direction is 0, or when ``label_single_trade`` returns None.

    Args:
        df: Full OHLCV DataFrame
        signal_mask: Boolean series, True where a setup was flagged
            (read positionally, so it must line up with df row by row)
        atr_series: ATR at each bar (aligned to df index)
        direction_series: +1/-1 for each signal bar
        forward_bars: Max forward window
        min_bars_remaining: Drop labels too close to end of data; a
            signal at position ``pos`` is skipped when
            ``pos + min_bars_remaining >= len(df)``

    Returns:
        Series of {1, 0, NaN} aligned to df.index
    """
    n = len(df)
    # Collect results in a plain float buffer and wrap it once at the end
    # instead of writing through Series.iloc on every signal.
    label_values = np.full(n, np.nan, dtype="float64")

    signal_positions = np.where(signal_mask.values)[0]

    for pos in signal_positions:
        # Not enough bars left after the signal: leave unlabeled.
        if pos + min_bars_remaining >= n:
            continue

        atr_raw = atr_series.iloc[pos]
        direction_raw = direction_series.iloc[pos]

        # Check for missing values BEFORE converting: int(NaN) raises
        # ValueError, which would abort the whole labeling pass.
        if pd.isna(atr_raw) or pd.isna(direction_raw):
            continue

        atr_val = float(atr_raw)
        direction = int(direction_raw)

        # Direction 0 means "no trade side"; nothing to label.
        if direction == 0:
            continue

        label = label_single_trade(df, int(pos), atr_val, direction, forward_bars)
        if label is not None:
            label_values[pos] = float(label)

    return pd.Series(label_values, index=df.index, dtype="float64")
| |
|
| |
|
def compute_label_stats(labels: pd.Series) -> dict:
    """
    Summarize a label series for diagnostics.

    NaN entries are ignored. The result holds the number of non-NaN
    labels, the counts of labels equal to 1 (wins) and 0 (losses), the
    win rate over all non-NaN labels (0.0 when there are none, rounded to
    4 decimals) and the wins-to-losses ratio (infinity when there are no
    losses, rounded to 3 decimals).
    """
    observed = labels.dropna()
    n_total = len(observed)
    n_wins = int(observed.eq(1).sum())
    n_losses = int(observed.eq(0).sum())

    # Guard both divisions against an empty denominator.
    hit_rate = 0.0
    if n_total > 0:
        hit_rate = n_wins / n_total

    if n_losses > 0:
        win_loss_ratio = n_wins / n_losses
    else:
        win_loss_ratio = float("inf")

    stats = {
        "total_labels": n_total,
        "wins": n_wins,
        "losses": n_losses,
        "win_rate": round(hit_rate, 4),
        "class_imbalance_ratio": round(win_loss_ratio, 3),
    }
    return stats
| |
|