Goshawk_Hedge_Pro / labeler.py
GoshawkVortexAI's picture
Create labeler.py
47584e0 verified
"""
labeler.py — Supervised learning target construction for crypto trading.
Target definition (binary):
y = 1 if the trade hits RR=1:2 target BEFORE stop within LABEL_FORWARD_BARS
y = 0 if stop is hit first OR neither hits within the window
Design decisions:
- Stop and target computed from ATR at signal bar (no lookahead)
- Realistic round-trip costs (fees + slippage) are added to the target distance, so price must travel further than the naive RR target to count as a win
- Both long and short labeling supported (direction from rule engine)
- Time-series integrity: labeling uses only forward prices from bar+1
- NaN label produced when insufficient forward bars exist (dropped later)
Target horizon N = 24 bars (1H timeframe = 1 full trading day):
- Short enough to avoid regime change within the trade
- Long enough for 1:2 RR to fully play out
- Empirically: >24 bars introduces too many confounding events
- <12 bars under-samples legitimate continuation moves
"""
import numpy as np
import pandas as pd
from typing import Optional
from ml_config import (
LABEL_FORWARD_BARS,
STOP_MULT,
TARGET_RR,
ROUND_TRIP_COST,
)
def label_single_trade(
    df: pd.DataFrame,
    signal_idx: int,
    atr: float,
    direction: int,  # +1 = long, -1 = short
    forward_bars: int = LABEL_FORWARD_BARS,
) -> Optional[int]:
    """
    Label a single trade signal as a win (1) or a loss (0).

    The entry price is the close of the signal bar. The stop sits
    ``atr * STOP_MULT`` away from entry, and the target sits
    ``stop_distance * TARGET_RR + entry_price * ROUND_TRIP_COST`` away in
    the trade direction. Only bars strictly after the signal bar are
    inspected, so the label never uses information from the signal bar's
    own high/low.

    Args:
        df: OHLCV DataFrame; the "close", "high" and "low" columns are read.
            Rows are addressed by integer position.
        signal_idx: Integer position of the signal bar in df.
        atr: ATR value at the signal bar, supplied by the caller.
        direction: +1 long, -1 short.
        forward_bars: Maximum number of bars after the signal bar to check.

    Returns:
        1 = win (target reached before the stop within the window)
        0 = loss (stop reached first, stop and target reached in the same
            bar, or neither reached within the window)
        None = the trade cannot be labeled: signal_idx is out of range or
            has no forward bars, forward_bars is not positive, direction
            is not +1/-1, or ATR / entry price is non-finite or ATR is
            not positive
    """
    n_bars = len(df)

    # A negative position would wrap around via iloc and read unrelated bars.
    if signal_idx < 0 or signal_idx + 1 >= n_bars:
        return None
    # A non-positive window would otherwise turn into a negative slice bound.
    if forward_bars <= 0:
        return None
    # Anything other than +1/-1 used to fall through to the short branch.
    if direction not in (1, -1):
        return None

    entry_price = float(df["close"].iloc[signal_idx])

    # NaN thresholds make every comparison False, which would silently turn
    # an unlabelable trade into a timeout loss; refuse to label instead.
    if not (np.isfinite(atr) and atr > 0 and np.isfinite(entry_price)):
        return None

    stop_distance = atr * STOP_MULT
    # Cost-adjusted threshold: price must move further than the naive RR
    # target so that the round-trip cost is covered.
    cost_distance = entry_price * ROUND_TRIP_COST
    target_distance = stop_distance * TARGET_RR + cost_distance

    start = signal_idx + 1
    end = min(start + forward_bars, n_bars)
    window = df.iloc[start:end]
    if len(window) == 0:
        return None

    highs = window["high"].to_numpy(dtype=float)
    lows = window["low"].to_numpy(dtype=float)

    if direction == 1:
        stop_price = entry_price - stop_distance
        target_price = entry_price + target_distance
        stop_mask = lows <= stop_price
        target_mask = highs >= target_price
    else:
        stop_price = entry_price + stop_distance
        target_price = entry_price - target_distance
        stop_mask = highs >= stop_price
        target_mask = lows <= target_price

    target_hits = np.flatnonzero(target_mask)
    if target_hits.size == 0:
        # Stop hit or timeout: both count as a loss.
        return 0

    stop_hits = np.flatnonzero(stop_mask)
    # Pessimistic ordering: when stop and target fall in the same bar the
    # stop is assumed to have been hit first.
    if stop_hits.size > 0 and stop_hits[0] <= target_hits[0]:
        return 0
    return 1
def label_dataframe(
    df: pd.DataFrame,
    signal_mask: pd.Series,
    atr_series: pd.Series,
    direction_series: pd.Series,
    forward_bars: int = LABEL_FORWARD_BARS,
    min_bars_remaining: int = LABEL_FORWARD_BARS,
) -> pd.Series:
    """
    Label all signal bars in a DataFrame.

    Every position where signal_mask is truthy is passed to
    label_single_trade together with the ATR and direction found at the
    same integer position. Positions that are skipped keep a NaN label.

    Args:
        df: Full OHLCV DataFrame
        signal_mask: Boolean series, True where a setup was flagged
        atr_series: ATR at each bar (read by integer position, so it must
            be positionally aligned with df)
        direction_series: +1/-1 for each signal bar (read by integer
            position)
        forward_bars: Max forward window handed to label_single_trade
        min_bars_remaining: Signals at position ``pos`` with
            ``pos + min_bars_remaining >= len(df)`` are left unlabeled

    Returns:
        float64 Series of {1, 0, NaN} indexed by df.index. NaN marks bars
        that are not signals, are too close to the end of the data, have a
        missing ATR or direction, have direction 0, or for which
        label_single_trade returned None.
    """
    n = len(df)
    # Fill a plain array and wrap it once at the end; this avoids a
    # per-signal Series.iloc assignment.
    label_values = np.full(n, np.nan, dtype="float64")

    signal_positions = np.where(signal_mask.values)[0]
    for pos in signal_positions:
        # Drop signals too close to end of data (insufficient forward bars)
        if pos + min_bars_remaining >= n:
            continue

        raw_atr = atr_series.iloc[pos]
        raw_direction = direction_series.iloc[pos]
        # Check for missing values BEFORE converting: int(NaN) raises
        # ValueError, which used to abort the whole labeling run.
        if pd.isna(raw_atr) or pd.isna(raw_direction):
            continue

        atr_val = float(raw_atr)
        direction = int(raw_direction)
        if np.isnan(atr_val) or direction == 0:
            continue

        label = label_single_trade(df, pos, atr_val, direction, forward_bars)
        if label is not None:
            label_values[pos] = float(label)

    return pd.Series(label_values, index=df.index, dtype="float64")
def compute_label_stats(labels: pd.Series) -> dict:
    """
    Summarise a label series for diagnostics.

    NaN entries are ignored. The result holds the number of non-NaN labels,
    how many equal 1 (wins) and 0 (losses), the win rate (wins / total,
    0.0 when there are no labels, rounded to 4 places) and the win/loss
    ratio (infinity when there are no losses, rounded to 3 places).
    """
    observed = labels.dropna()
    n_labels = len(observed)
    n_wins = int(observed.eq(1).sum())
    n_losses = int(observed.eq(0).sum())

    # Guard both divisions against an empty denominator.
    if n_labels > 0:
        hit_rate = n_wins / n_labels
    else:
        hit_rate = 0.0

    if n_losses > 0:
        win_loss_ratio = n_wins / n_losses
    else:
        win_loss_ratio = float("inf")

    stats = {}
    stats["total_labels"] = n_labels
    stats["wins"] = n_wins
    stats["losses"] = n_losses
    stats["win_rate"] = round(hit_rate, 4)
    stats["class_imbalance_ratio"] = round(win_loss_ratio, 3)
    return stats