File size: 5,015 Bytes
47584e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
"""
labeler.py — Supervised learning target construction for crypto trading.

Target definition (binary):
    y = 1 if the trade hits RR=1:2 target BEFORE stop within LABEL_FORWARD_BARS
    y = 0 if stop is hit first OR neither hits within the window

Design decisions:
  - Stop and target computed from ATR at signal bar (no lookahead)
  - Realistic costs (fees + slippage) are added to the target distance, so price must move further than the naive RR target
  - Both long and short labeling supported (direction from rule engine)
  - Time-series integrity: labeling uses only forward prices from bar+1
  - NaN label produced when insufficient forward bars exist (dropped later)

Target horizon N = 24 bars (1H timeframe = 1 full trading day):
  - Short enough to avoid regime change within the trade
  - Long enough for 1:2 RR to fully play out
  - Empirically: >24 bars introduces too many confounding events
  - <12 bars under-samples legitimate continuation moves
"""

import numpy as np
import pandas as pd
from typing import Optional

from ml_config import (
    LABEL_FORWARD_BARS,
    STOP_MULT,
    TARGET_RR,
    ROUND_TRIP_COST,
)


def label_single_trade(
    df: pd.DataFrame,
    signal_idx: int,
    atr: float,
    direction: int,  # +1 = long, -1 = short
    forward_bars: int = LABEL_FORWARD_BARS,
) -> Optional[int]:
    """
    Decide whether one signal would have reached its target before its stop.

    The trade is assumed to be entered at the close of the signal bar. The
    stop sits ``atr * STOP_MULT`` away from entry; the target sits
    ``TARGET_RR`` times that distance away plus the round-trip cost
    (``entry * ROUND_TRIP_COST``). Only bars strictly after the signal bar
    are inspected.

    Args:
        df: OHLCV DataFrame; the "close", "high" and "low" columns are read
            and rows are addressed by integer position.
        signal_idx: Integer position of the signal bar in df.
        atr: ATR value at the signal bar.
        direction: 1 selects the long branch; any other value is handled
            by the short branch.
        forward_bars: Maximum number of bars after the signal to scan.

    Returns:
        1 if the target is reached first,
        0 if the stop is reached first (or on the same bar as the target),
          or if neither level is reached inside the window,
        None if there are no bars after the signal to scan.
    """
    total_bars = len(df)
    first_forward = signal_idx + 1
    if first_forward >= total_bars:
        return None

    is_long = direction == 1
    entry = float(df["close"].iloc[signal_idx])

    # Risk is the raw stop distance; the reward distance is widened by the
    # round-trip cost so a "win" still nets the intended RR after costs.
    risk = atr * STOP_MULT
    reward = risk * TARGET_RR + entry * ROUND_TRIP_COST

    if is_long:
        stop_level, target_level = entry - risk, entry + reward
    else:
        stop_level, target_level = entry + risk, entry - reward

    last_forward = min(first_forward + forward_bars, total_bars)
    window = df.iloc[first_forward:last_forward]
    if len(window) < 1:
        return None

    for raw_high, raw_low in zip(window["high"], window["low"]):
        bar_high = float(raw_high)
        bar_low = float(raw_low)

        if is_long:
            stopped = bar_low <= stop_level
            reached = bar_high >= target_level
        else:
            stopped = bar_high >= stop_level
            reached = bar_low <= target_level

        # Pessimistic ordering: when both levels fall inside one bar the
        # stop is assumed to have been touched first.
        if stopped:
            return 0
        if reached:
            return 1

    # Window exhausted without touching either level: counted as a loss.
    return 0


def label_dataframe(
    df: pd.DataFrame,
    signal_mask: pd.Series,
    atr_series: pd.Series,
    direction_series: pd.Series,
    forward_bars: int = LABEL_FORWARD_BARS,
    min_bars_remaining: int = LABEL_FORWARD_BARS,
) -> pd.Series:
    """
    Label all signal bars in a DataFrame.

    Every bar flagged in ``signal_mask`` is passed to ``label_single_trade``
    unless it is skipped for one of the reasons below. Skipped and
    non-signal bars keep a NaN label.

    A flagged bar is skipped when:
      - fewer than ``min_bars_remaining`` bars follow it in ``df``,
      - its ATR is NaN,
      - its direction is missing (NaN/None/NA) or 0.

    All inputs are read by integer position (``.values`` / ``.iloc``), so
    ``signal_mask``, ``atr_series`` and ``direction_series`` must be laid out
    row-for-row like ``df``.

    Args:
        df: Full OHLCV DataFrame
        signal_mask: Boolean series, True where a setup was flagged
        atr_series: ATR at each bar (aligned to df index)
        direction_series: +1/-1 for each signal bar
        forward_bars: Max forward window
        min_bars_remaining: Drop labels too close to end of data

    Returns:
        float64 Series of {1, 0, NaN} aligned to df.index
    """
    labels = pd.Series(np.nan, index=df.index, dtype="float64")
    n = len(df)

    signal_positions = np.where(signal_mask.values)[0]

    for pos in signal_positions:
        # Drop signals too close to end of data (insufficient forward bars)
        if pos + min_bars_remaining >= n:
            continue

        atr_val = float(atr_series.iloc[pos])
        raw_direction = direction_series.iloc[pos]

        # A missing direction must be checked BEFORE int(): int(nan) raises
        # ValueError and would abort labeling for the whole frame. It is
        # treated the same as direction 0 (no trade to label).
        if np.isnan(atr_val) or pd.isna(raw_direction):
            continue

        direction = int(raw_direction)
        if direction == 0:
            continue

        label = label_single_trade(df, pos, atr_val, direction, forward_bars)
        if label is not None:
            labels.iloc[pos] = float(label)

    return labels


def compute_label_stats(labels: pd.Series) -> dict:
    """
    Summarise a label series for diagnostics.

    NaN entries are ignored. The result holds the number of non-NaN labels,
    the count of labels equal to 1 (wins) and equal to 0 (losses), the win
    rate (wins / non-NaN labels, 0.0 when there are none) rounded to 4
    places, and the wins-to-losses ratio rounded to 3 places (infinity when
    there are no losses).
    """
    observed = labels.dropna()
    n_labels = len(observed)
    n_wins = int(observed.eq(1).sum())
    n_losses = int(observed.eq(0).sum())

    if n_labels > 0:
        hit_rate = n_wins / n_labels
    else:
        hit_rate = 0.0

    if n_losses > 0:
        win_loss_ratio = n_wins / n_losses
    else:
        # No losses to divide by: report an unbounded ratio.
        win_loss_ratio = float("inf")

    stats = {}
    stats["total_labels"] = n_labels
    stats["wins"] = n_wins
    stats["losses"] = n_losses
    stats["win_rate"] = round(hit_rate, 4)
    stats["class_imbalance_ratio"] = round(win_loss_ratio, 3)
    return stats