Spaces:

GoshawkVortexAI
/

Goshawk_Hedge_Pro

Running

File size: 5,015 Bytes

47584e0

"""
labeler.py — Supervised learning target construction for crypto trading.

Target definition (binary):
    y = 1 if the trade hits RR=1:2 target BEFORE stop within LABEL_FORWARD_BARS
    y = 0 if stop is hit first OR neither hits within the window

Design decisions:
  - Stop and target computed from ATR at signal bar (no lookahead)
  - Realistic costs (fees + slippage) added to the target distance, so price must move further to win
  - Both long and short labeling supported (direction from rule engine)
  - Time-series integrity: labeling uses only forward prices from bar+1
  - NaN label produced when insufficient forward bars exist (dropped later)

Target horizon N = 24 bars (1H timeframe = 1 full trading day):
  - Short enough to avoid regime change within the trade
  - Long enough for 1:2 RR to fully play out
  - Empirically: >24 bars introduces too many confounding events
  - <12 bars under-samples legitimate continuation moves
"""

import numpy as np
import pandas as pd
from typing import Optional

from ml_config import (
    LABEL_FORWARD_BARS,
    STOP_MULT,
    TARGET_RR,
    ROUND_TRIP_COST,
)


def label_single_trade(
    df: pd.DataFrame,
    signal_idx: int,
    atr: float,
    direction: int,  # +1 = long, -1 = short
    forward_bars: int = LABEL_FORWARD_BARS,
) -> Optional[int]:
    """
    Label a single trade signal by replaying the bars that follow it.

    Entry is the close of the signal bar. The stop sits ``atr * STOP_MULT``
    away from entry. The target sits ``TARGET_RR`` times the stop distance
    away, widened by the round-trip cost (``entry * ROUND_TRIP_COST``), so
    price has to move further than the naive RR for the trade to count as
    a win.

    Args:
        df: Full OHLCV DataFrame (index = timestamp, sorted ascending);
            the "close", "high" and "low" columns are read
        signal_idx: Integer position of signal bar in df (negative values
            count from the end, as with ``iloc``)
        atr: ATR value AT signal bar (must be pre-computed, no lookahead)
        direction: +1 long, -1 short
        forward_bars: Max bars to check

    Returns:
        1 = win (target hit first)
        0 = loss (stop hit first, stop and target hit inside the same bar,
            or neither hit within the window)
        None = trade cannot be labelled: no forward bars available,
            ATR is not a finite positive number, or direction is not +1/-1
    """
    n_bars = len(df)

    # Normalise negative positions up front. Without this, signal_idx == -1
    # would read the last close but slice the forward window from position 0,
    # i.e. replay bars from the start of the data.
    if signal_idx < 0:
        signal_idx += n_bars
    if signal_idx < 0 or signal_idx + 1 >= n_bars:
        return None

    # A non-positive window has nothing to replay (and a negative slice end
    # would wrap around), so treat it as "insufficient data".
    if forward_bars <= 0:
        return None

    # Anything other than +1/-1 must not silently be labelled as a short.
    if direction not in (1, -1):
        return None

    # NaN/inf/non-positive ATR gives no usable stop distance; every
    # comparison below would be False or degenerate and the trade would be
    # reported as a loss even though it was never a valid setup.
    if not np.isfinite(atr) or atr <= 0:
        return None

    entry_price = float(df["close"].iloc[signal_idx])
    stop_distance = atr * STOP_MULT

    # Cost-adjusted threshold: price must move further than the naive RR.
    round_trip_cost = entry_price * ROUND_TRIP_COST
    target_distance = stop_distance * TARGET_RR + round_trip_cost

    first_forward = signal_idx + 1
    last_forward = min(first_forward + forward_bars, n_bars)
    window = df.iloc[first_forward:last_forward]
    if len(window) == 0:
        return None

    highs = window["high"].to_numpy(dtype=float)
    lows = window["low"].to_numpy(dtype=float)

    if direction == 1:  # long: stop below entry, target above
        stop_hit = lows <= entry_price - stop_distance
        target_hit = highs >= entry_price + target_distance
    else:  # short: stop above entry, target below
        stop_hit = highs >= entry_price + stop_distance
        target_hit = lows <= entry_price - target_distance

    resolved = stop_hit | target_hit
    if not resolved.any():
        # Neither hit within window = loss (opportunity cost + fees)
        return 0

    # argmax on a boolean array returns the first True, i.e. the first bar
    # that touched either level. Pessimistic ordering: if that bar touched
    # the stop at all, the trade is a loss even if the target was also hit.
    first_resolved = int(np.argmax(resolved))
    return 0 if stop_hit[first_resolved] else 1


def label_dataframe(
    df: pd.DataFrame,
    signal_mask: pd.Series,
    atr_series: pd.Series,
    direction_series: pd.Series,
    forward_bars: int = LABEL_FORWARD_BARS,
    min_bars_remaining: int = LABEL_FORWARD_BARS,
) -> pd.Series:
    """
    Label all signal bars in a DataFrame.

    Every position where ``signal_mask`` is truthy is passed to
    ``label_single_trade``; all other positions stay NaN. The three series
    are read positionally, so they must have the same length and order as
    ``df``.

    Args:
        df: Full OHLCV DataFrame
        signal_mask: Boolean series, True where a setup was flagged
        atr_series: ATR at each bar (aligned to df index)
        direction_series: +1/-1 for each signal bar
        forward_bars: Max forward window
        min_bars_remaining: Drop labels too close to end of data

    Returns:
        Series of {1, 0, NaN} aligned to df.index. NaN marks bars that are
        not signals, signals too close to the end of the data, and signals
        whose ATR is missing/non-finite/non-positive or whose direction is
        missing or 0.
    """
    labels = pd.Series(np.nan, index=df.index, dtype="float64")
    n = len(df)

    signal_positions = np.where(signal_mask.values)[0]

    for pos in signal_positions:
        # Drop signals too close to end of data (insufficient forward bars)
        if pos + min_bars_remaining >= n:
            continue

        raw_atr = atr_series.iloc[pos]
        raw_direction = direction_series.iloc[pos]

        # Check for missing values BEFORE converting: int(nan) raises
        # ValueError, which used to abort the whole labelling run on a single
        # signal bar with no direction.
        if pd.isna(raw_atr) or pd.isna(raw_direction):
            continue

        atr_val = float(raw_atr)
        direction = int(raw_direction)

        # An infinite or non-positive ATR gives no usable stop distance, and
        # direction 0 means the rule engine chose no side — skip both.
        if not np.isfinite(atr_val) or atr_val <= 0 or direction == 0:
            continue

        label = label_single_trade(df, int(pos), atr_val, direction, forward_bars)
        if label is not None:
            labels.iloc[pos] = float(label)

    return labels


def compute_label_stats(labels: pd.Series) -> dict:
    """
    Summarise a label series for diagnostics.

    NaN entries are ignored. The result holds the number of non-NaN labels,
    how many equal 1 (wins) and 0 (losses), the win rate (wins / total,
    0.0 when there are no labels) rounded to 4 places, and the wins-to-losses
    ratio rounded to 3 places (``inf`` when there are no losses).
    """
    observed = labels.dropna()
    n_total = len(observed)
    n_wins = int(observed.eq(1).sum())
    n_losses = int(observed.eq(0).sum())

    if n_total > 0:
        hit_rate = n_wins / n_total
    else:
        hit_rate = 0.0

    # With no losses the ratio is undefined; report it as infinity.
    if n_losses > 0:
        wins_per_loss = n_wins / n_losses
    else:
        wins_per_loss = float("inf")

    stats = {}
    stats["total_labels"] = n_total
    stats["wins"] = n_wins
    stats["losses"] = n_losses
    stats["win_rate"] = round(hit_rate, 4)
    stats["class_imbalance_ratio"] = round(wins_per_loss, 3)
    return stats