# Uploaded by algorembrant -- "Upload 61 files" (commit 9cb5a00, verified)
"""
ML-3m-trader Labeling Engine
==============================
Generates supervised-learning labels by simulating potential trades
at each bar and checking 1:1 RR outcomes over a lookahead window.
Labels
------
0 = DO_NOTHING (no usable ATR stop distance, or spread filter failed)
1 = BUY (long setup that would hit TP before SL)
2 = SELL (short setup that would hit TP before SL)
3 = HOLD (neither BUY nor SELL produced a winner)
"""
import numpy as np
import pandas as pd
import config as cfg
def _compute_atr(high: np.ndarray, low: np.ndarray, close: np.ndarray,
period: int) -> np.ndarray:
"""Vectorized ATR (Wilder) for the labeler."""
n = len(high)
tr = np.empty(n, dtype=np.float64)
tr[0] = high[0] - low[0]
for i in range(1, n):
tr[i] = max(high[i] - low[i],
abs(high[i] - close[i - 1]),
abs(low[i] - close[i - 1]))
atr = np.empty(n, dtype=np.float64)
atr[:] = np.nan
atr[period - 1] = np.mean(tr[:period])
alpha = 1.0 / period
for i in range(period, n):
atr[i] = atr[i - 1] * (1 - alpha) + tr[i] * alpha
return atr
def _simulate_rr_outcomes(high: np.ndarray, low: np.ndarray, start: int,
                          stop: int, entry: float, dist: float):
    """Walk forward from bar ``start`` and time the 1:1 long and short setups.

    Both setups enter at ``entry`` with stop-loss and take-profit ``dist``
    away on opposite sides.  Bars ``start + 1 .. stop - 1`` are scanned in
    order.  Within a single bar the stop-loss is tested before the
    take-profit, so a bar touching both levels counts as a loss.

    Returns
    -------
    (buy_bars, sell_bars) : tuple of int
        Number of bars until the respective take-profit was hit, or 0 when
        that setup was stopped out first or did not resolve before ``stop``.
    """
    upper = entry + dist  # long take-profit == short stop-loss
    lower = entry - dist  # long stop-loss == short take-profit
    pending = -1          # setup not resolved yet
    stopped = 0           # stop-loss was hit first
    buy_bars = pending
    sell_bars = pending

    for j in range(start + 1, stop):
        bar_high = high[j]
        bar_low = low[j]
        if buy_bars == pending:
            if bar_low <= lower:
                buy_bars = stopped
            elif bar_high >= upper:
                buy_bars = j - start
        if sell_bars == pending:
            if bar_high >= upper:
                sell_bars = stopped
            elif bar_low <= lower:
                sell_bars = j - start
        if buy_bars != pending and sell_bars != pending:
            break

    # Unresolved setups are reported like losses: no winner.
    return max(buy_bars, 0), max(sell_bars, 0)


def _print_label_distribution(labels: np.ndarray) -> None:
    """Print how many bars received each label, keyed by ``cfg.LABEL_NAMES``."""
    unique, counts = np.unique(labels, return_counts=True)
    # Plain ints keep the printed dict readable when a label has no name.
    dist = {cfg.LABEL_NAMES.get(int(u), int(u)): int(c)
            for u, c in zip(unique, counts)}
    print(f"[INFO] Label distribution: {dist}")


def generate_labels(df: pd.DataFrame) -> np.ndarray:
    """
    Assign a supervised-learning label to every bar of ``df``.

    For each bar:

    1. Compute the ATR-based stop-loss distance
       (``ATR * cfg.ATR_SL_MULTIPLIER``).
    2. If the distance is NaN or not positive, or smaller than
       ``spread * cfg.SPREAD_FILTER_MULTIPLIER`` -> DO_NOTHING.
    3. BUY: entry=close, SL=close - sl_dist, TP=close + sl_dist (1:1 RR).
       Walk forward up to ``cfg.LABEL_LOOKAHEAD_BARS`` bars; TP hit before
       SL -> BUY candidate.
    4. SELL: entry=close, SL=close + sl_dist, TP=close - sl_dist.
       Walk forward; TP hit before SL -> SELL candidate.
    5. If both BUY and SELL win -> pick the one that hits TP sooner (BUY on
       a tie).  With the symmetric levels used here the long TP equals the
       short SL, so in practice at most one side wins; the tie-break is
       kept as a safeguard.
    6. If neither wins -> HOLD.  This includes bars near the end of the
       data whose lookahead window ends before either setup resolves.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain: high, low, close, spread

    Returns
    -------
    np.ndarray of int32
        Label array aligned with df index.  If ``df`` is empty or has fewer
        rows than ``cfg.ATR_PERIOD``, no stop distance can be computed and
        every bar is labeled DO_NOTHING.
    """
    high = df["high"].values.astype(np.float64)
    low = df["low"].values.astype(np.float64)
    close = df["close"].values.astype(np.float64)
    spread_raw = df["spread"].values.astype(np.float64)

    n = len(close)
    if n == 0 or n < cfg.ATR_PERIOD:
        # Too few bars to seed the ATR; this is the same outcome an all-NaN
        # stop distance would produce in the main path below.
        labels = np.full(n, cfg.LABEL_DO_NOTHING, dtype=np.int32)
        _print_label_distribution(labels)
        return labels

    # Spread unit heuristic, unchanged from the original implementation:
    # a median below 1.0 is treated as a point count and scaled by the
    # point size; otherwise the column is used as-is.
    # NOTE(review): MT5 normally reports spread as an integer number of
    # points (>= 1), which this heuristic leaves unscaled -- confirm the
    # unit of the "spread" column against the data feed.
    point = 0.01  # XAUUSDc point size, per the original author
    if np.nanmedian(spread_raw) < 1.0:
        spread = spread_raw * point
    else:
        spread = spread_raw

    atr = _compute_atr(high, low, close, cfg.ATR_PERIOD)
    sl_dist = atr * cfg.ATR_SL_MULTIPLIER

    lookahead = cfg.LABEL_LOOKAHEAD_BARS
    label_buy = cfg.LABEL_BUY
    label_sell = cfg.LABEL_SELL

    # Every bar starts as HOLD; the branches below overwrite as needed.
    labels = np.full(n, cfg.LABEL_HOLD, dtype=np.int32)

    # DO_NOTHING filters for all bars at once.  `~(sl_dist > 0)` is true for
    # NaN as well as for non-positive distances; a NaN spread compares False
    # and therefore does not block the bar.
    with np.errstate(invalid="ignore"):
        blocked = ~(sl_dist > 0) | (sl_dist < spread * cfg.SPREAD_FILTER_MULTIPLIER)
    labels[blocked] = cfg.LABEL_DO_NOTHING

    for idx in np.flatnonzero(~blocked):
        i = int(idx)
        stop = min(i + lookahead + 1, n)
        buy_bars, sell_bars = _simulate_rr_outcomes(
            high, low, i, stop, close[i], sl_dist[i]
        )
        if buy_bars and sell_bars:
            labels[i] = label_buy if buy_bars <= sell_bars else label_sell
        elif buy_bars:
            labels[i] = label_buy
        elif sell_bars:
            labels[i] = label_sell
        # else: neither side won, the bar keeps its initial HOLD label

    _print_label_distribution(labels)
    return labels