File size: 5,015 Bytes
47584e0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 | """
labeler.py — Supervised learning target construction for crypto trading.
Target definition (binary):
y = 1 if the trade hits RR=1:2 target BEFORE stop within LABEL_FORWARD_BARS
y = 0 if stop is hit first OR neither hits within the window
Design decisions:
- Stop and target computed from ATR at signal bar (no lookahead)
- Realistic costs (fees + slippage) added to the target distance, so price must move further to count as a win
- Both long and short labeling supported (direction from rule engine)
- Time-series integrity: labeling uses only forward prices from bar+1
- NaN label produced when insufficient forward bars exist (dropped later)
Target horizon N = 24 bars (1H timeframe = 1 full trading day):
- Short enough to avoid regime change within the trade
- Long enough for 1:2 RR to fully play out
- Empirically: >24 bars introduces too many confounding events
- <12 bars under-samples legitimate continuation moves
"""
import numpy as np
import pandas as pd
from typing import Optional
from ml_config import (
LABEL_FORWARD_BARS,
STOP_MULT,
TARGET_RR,
ROUND_TRIP_COST,
)
def label_single_trade(
    df: pd.DataFrame,
    signal_idx: int,
    atr: float,
    direction: int,  # +1 = long, -1 = short
    forward_bars: int = LABEL_FORWARD_BARS,
) -> Optional[int]:
    """
    Label a single trade signal.

    The trade is entered at the close of the signal bar. The stop sits
    ``atr * STOP_MULT`` away from entry; the target sits
    ``stop_distance * TARGET_RR`` plus the round-trip cost
    (``entry_price * ROUND_TRIP_COST``) away on the opposite side. Up to
    ``forward_bars`` bars starting at ``signal_idx + 1`` are then scanned
    in order using their ``high`` and ``low`` columns.

    Args:
        df: Full OHLCV DataFrame (index = timestamp, sorted ascending)
        signal_idx: Integer position of signal bar in df (non-negative)
        atr: ATR value AT signal bar (must be pre-computed, no lookahead)
        direction: +1 long, -1 short
        forward_bars: Max bars to check

    Returns:
        1 = win (target hit first)
        0 = loss (stop hit first, stop and target inside the same bar,
            or neither hit within the window)
        None = trade cannot be labeled: no forward bars, negative
            ``signal_idx``, non-positive ``forward_bars``, ``atr`` that is
            not a finite positive number, or ``direction`` outside {+1, -1}
    """
    # Anything other than +1 used to fall through to the short branch
    # (including 0); refuse to label instead of guessing a side.
    if direction not in (1, -1):
        return None
    # A NaN / zero / negative ATR gives a stop at (or on the wrong side of)
    # the entry price, which cannot produce a meaningful label.
    if not np.isfinite(atr) or atr <= 0:
        return None
    # Negative positions or windows would wrap around in iloc and scan
    # bars that are not the signal's forward window.
    if signal_idx < 0 or forward_bars <= 0 or signal_idx + 1 >= len(df):
        return None

    entry_price = float(df["close"].iloc[signal_idx])
    stop_distance = atr * STOP_MULT
    # Cost-adjusted threshold: price must travel further than the naive RR
    # distance to cover fees + slippage (expressed in price units).
    round_trip_cost = entry_price * ROUND_TRIP_COST
    target_distance = stop_distance * TARGET_RR + round_trip_cost

    # iloc slicing clips at the end of the frame, so a short tail simply
    # yields a shorter window. The guards above make it non-empty.
    first_forward = signal_idx + 1
    window = df.iloc[first_forward : first_forward + forward_bars]
    highs = window["high"].to_numpy(dtype=float)
    lows = window["low"].to_numpy(dtype=float)

    if direction == 1:  # long: stop below entry, target above
        stop_hit = lows <= entry_price - stop_distance
        target_hit = highs >= entry_price + target_distance
    else:  # short: stop above entry, target below
        stop_hit = highs >= entry_price + stop_distance
        target_hit = lows <= entry_price - target_distance

    # Neither hit within window = loss (opportunity cost + fees)
    if not target_hit.any():
        return 0

    # Pessimistic ordering: a stop touched on or before the first target
    # bar (including inside that same bar) counts as a loss.
    first_target_bar = int(target_hit.argmax())
    if stop_hit[: first_target_bar + 1].any():
        return 0
    return 1
def label_dataframe(
    df: pd.DataFrame,
    signal_mask: pd.Series,
    atr_series: pd.Series,
    direction_series: pd.Series,
    forward_bars: int = LABEL_FORWARD_BARS,
    min_bars_remaining: int = LABEL_FORWARD_BARS,
) -> pd.Series:
    """
    Label all signal bars in a DataFrame.

    All inputs are read by integer position, so ``signal_mask``,
    ``atr_series`` and ``direction_series`` must have the same row order
    as ``df``.

    Args:
        df: Full OHLCV DataFrame
        signal_mask: Boolean series, True where a setup was flagged
        atr_series: ATR at each bar (aligned to df index)
        direction_series: +1/-1 for each signal bar
        forward_bars: Max forward window
        min_bars_remaining: Drop labels too close to end of data

    Returns:
        Series of {1, 0, NaN} aligned to df.index. A bar stays NaN when it
        is not a signal, when it is within ``min_bars_remaining`` bars of
        the end of the data, when its ATR or direction is missing, when
        its direction is 0, or when ``label_single_trade`` returns None.

    Note:
        If ``forward_bars`` is larger than ``min_bars_remaining``, signals
        near the end of the data are still labeled, on a window shorter
        than ``forward_bars``.
    """
    n = len(df)
    # Fill a plain NumPy buffer and wrap it once at the end; this avoids a
    # per-signal Series.iloc assignment.
    label_values = np.full(n, np.nan, dtype="float64")

    for pos in np.flatnonzero(signal_mask.values):
        # Drop signals too close to end of data (insufficient forward bars)
        if pos + min_bars_remaining >= n:
            continue

        raw_atr = atr_series.iloc[pos]
        raw_direction = direction_series.iloc[pos]
        # Check for missing values BEFORE converting: int(NaN) raises
        # ValueError, which used to abort the whole labeling run when a
        # flagged bar had no direction.
        if pd.isna(raw_atr) or pd.isna(raw_direction):
            continue

        atr_val = float(raw_atr)
        direction = int(raw_direction)
        if direction == 0:
            continue

        label = label_single_trade(df, pos, atr_val, direction, forward_bars)
        if label is not None:
            label_values[pos] = float(label)

    return pd.Series(label_values, index=df.index, dtype="float64")
def compute_label_stats(labels: pd.Series) -> dict:
    """
    Summarize a label series for diagnostics.

    NaN entries are ignored. Wins are entries equal to 1, losses are
    entries equal to 0; the total counts every non-NaN entry.

    Args:
        labels: Series of labels, possibly containing NaN.

    Returns:
        Dict with ``total_labels``, ``wins``, ``losses``, ``win_rate``
        (wins / total, rounded to 4 places, 0.0 when there are no labels)
        and ``class_imbalance_ratio`` (wins / losses, rounded to 3 places,
        ``inf`` when there are no losses).
    """
    observed = labels.dropna()
    n_labels = len(observed)
    n_wins = int(observed.eq(1).sum())
    n_losses = int(observed.eq(0).sum())

    # Guard both divisions against an empty denominator.
    if n_labels > 0:
        win_fraction = n_wins / n_labels
    else:
        win_fraction = 0.0

    if n_losses > 0:
        wins_per_loss = n_wins / n_losses
    else:
        wins_per_loss = float("inf")

    stats = {}
    stats["total_labels"] = n_labels
    stats["wins"] = n_wins
    stats["losses"] = n_losses
    stats["win_rate"] = round(win_fraction, 4)
    stats["class_imbalance_ratio"] = round(wins_per_loss, 3)
    return stats
|