DHAN_AI_DATA / p170_max_system.py

P170 10yr model + code — 2026-05-15

b22f9ac verified 26 days ago

54.4 kB

	"""
	P170 MAXIMUM SYSTEM — dhan_financial_intelligence
	==================================================
	Confirmed data layout:
	data/lake/clean/market_data/NSE/SYMBOL_real_market_data_v2.parquet
	columns: [timestamp, open, high, low, close, volume] ← 2155 symbols
	data/lake/ml/features/NSE/SYMBOL_features_ml_dataset_v1.parquet
	→ only 5 symbols (HDFCBANK, ICICIBANK, INFY, RELIANCE, TCS) — fallback only

	All outputs go to data/lake/ml/p170/ — nothing existing is touched.

	Usage:
	cd /home/user1/dhan_financial_intelligence
	source .venv/bin/activate
	pip install lightgbm catboost --break-system-packages # recommended
	python p170_max_system.py --check-deps
	python p170_max_system.py --symbol RELIANCE
	python p170_max_system.py --all-symbols --top-n 50
	python p170_max_system.py --all-symbols
	python p170_max_system.py --predict RELIANCE
	"""

	import sys
	import json
	import time
	import logging
	import warnings
	import argparse
	import traceback
	from pathlib import Path
	from datetime import datetime
	from typing import Optional, List, Tuple

	import numpy as np
	import pandas as pd

	warnings.filterwarnings("ignore")

	# ── Logging ───────────────────────────────────────────────────────────────────
	Path("logs/ml").mkdir(parents=True, exist_ok=True)
	logging.basicConfig(
	level=logging.INFO,
	format="%(asctime)s [%(levelname)s] %(message)s",
	handlers=[
	logging.StreamHandler(sys.stdout),
	logging.FileHandler(f"logs/ml/p170_{datetime.now():%Y%m%d_%H%M%S}.log"),
	],
	)
	log = logging.getLogger("p170")

	# ─────────────────────────────────────────────────────────────────────────────
	# PATHS
	# ─────────────────────────────────────────────────────────────────────────────
	BASE = Path(".")
	CLEAN_DIR = BASE / "data/lake/clean/market_data/NSE" # 2155 *_real_market_data_v2.parquet
	FEAT_DIR = BASE / "data/lake/ml/features/NSE" # 5 *_features_ml_dataset_v1.parquet
	OUT = BASE / "data/lake/ml/p170"
	MODELS_DIR = OUT / "models"
	METRICS_DIR = OUT / "metrics"
	PREDS_DIR = OUT / "predictions"
	REGISTRY_DIR = OUT / "registry"

	for d in [MODELS_DIR, METRICS_DIR, PREDS_DIR, REGISTRY_DIR]:
	d.mkdir(parents=True, exist_ok=True)

	# ─────────────────────────────────────────────────────────────────────────────
	# LABEL CONFIG (matches existing policy exactly)
	# ─────────────────────────────────────────────────────────────────────────────
	TARGET = "direction_class" # 0=down 1=neutral 2=up
	HORIZON = 3 # 3-day forward return
	POS_TH = 0.015 # ≥+1.5% → up (was 0.5% → gave 9% neutral, unusable)
	NEG_TH = -0.015 # ≤-1.5% → down (±1.5% gives ~33/33/33 split)
	TRAIN_R = 0.70
	VAL_R = 0.15
	TEST_R = 0.15
	WF_FOLDS = 5
	MIN_ROWS = 80
	SEED = 42
	np.random.seed(SEED)


	# ─────────────────────────────────────────────────────────────────────────────
	# SYMBOL DISCOVERY
	# ─────────────────────────────────────────────────────────────────────────────

	def discover_symbols() -> List[Tuple[str, Path]]:
	"""
	Returns [(symbol, parquet_path), ...] sorted largest-file-first (most data first).
	Strips _real_market_data_v2 to get clean tickers.
	"""
	seen: dict = {}

	# Primary: 2155 clean market data files
	for f in CLEAN_DIR.glob("*_real_market_data_v2.parquet"):
	sym = f.stem.replace("_real_market_data_v2", "").upper()
	if sym:
	seen[sym] = f

	# Fallback: 5 legacy feature files (only adds symbols not already in clean data)
	for f in FEAT_DIR.glob("*_features_ml_dataset_v1.parquet"):
	sym = f.stem.replace("_features_ml_dataset_v1", "").upper()
	if sym and sym not in seen:
	seen[sym] = f

	pairs = list(seen.items())
	pairs.sort(key=lambda x: x[1].stat().st_size, reverse=True)
	return pairs


	# ─────────────────────────────────────────────────────────────────────────────
	# DATA LOADING
	# ─────────────────────────────────────────────────────────────────────────────

	def load_candles(symbol: str) -> Optional[pd.DataFrame]:
	"""
	Load OHLCV for symbol. Returns DataFrame with column 'date' (datetime).
	Clean market data files use 'timestamp' — renamed here.
	Returns None if not found or too small.
	"""
	candidates = [
	CLEAN_DIR / f"{symbol}_real_market_data_v2.parquet", # primary (2155 symbols)
	CLEAN_DIR / f"{symbol}.parquet", # alternative naming
	FEAT_DIR / f"{symbol}_features_ml_dataset_v1.parquet", # legacy fallback
	]

	for path in candidates:
	if not path.exists():
	continue
	try:
	df = pd.read_parquet(path)

	# Normalise date column
	if "date" not in df.columns:
	if "timestamp" in df.columns:
	df = df.rename(columns={"timestamp": "date"})
	elif df.index.name:
	df = df.reset_index().rename(columns={df.index.name: "date"})
	else:
	continue

	df["date"] = pd.to_datetime(df["date"])

	# Need full OHLCV — feature-only files don't have open/high/low
	if not all(c in df.columns for c in ["open", "high", "low", "close", "volume"]):
	continue

	df = (df[["date", "open", "high", "low", "close", "volume"]]
	.dropna(subset=["close", "volume"])
	.sort_values("date")
	.reset_index(drop=True))

	if len(df) >= MIN_ROWS:
	return df

	except Exception as e:
	log.debug(f"Could not load {path}: {e}")
	continue

	return None


	# ─────────────────────────────────────────────────────────────────────────────
	# LAYER 0 — FEATURE ENGINEERING (87 features)
	# ─────────────────────────────────────────────────────────────────────────────


	# ── NIFTYBEES market index cache ──────────────────────────────────────────────
	_NIFTYBEES_CACHE: dict = {}

	def _load_niftybees() -> "pd.DataFrame \| None":
	"""
	Load NIFTYBEES (Nifty50 proxy) and compute market-level features.
	Cached after first load. Returns DataFrame with columns:
	[date, mkt_ret_1d, mkt_ret_5d, mkt_vol_20d, mkt_above_ma20]
	"""
	global _NIFTYBEES_CACHE
	if "df" in _NIFTYBEES_CACHE:
	return _NIFTYBEES_CACHE["df"]

	path = CLEAN_DIR / "NIFTYBEES_real_market_data_v2.parquet"
	if not path.exists():
	_NIFTYBEES_CACHE["df"] = None
	return None

	try:
	m = pd.read_parquet(path)
	if "timestamp" in m.columns:
	m = m.rename(columns={"timestamp": "date"})
	m["date"] = pd.to_datetime(m["date"])
	m = m.sort_values("date").reset_index(drop=True)

	mc = m["close"].astype(float)
	m["mkt_ret_1d"] = mc.pct_change(1)
	m["mkt_ret_5d"] = mc.pct_change(5)
	m["mkt_vol_20d"] = m["mkt_ret_1d"].rolling(20).std()
	m["mkt_ma20"] = mc.rolling(20).mean()
	m["mkt_above_ma20"] = (mc > m["mkt_ma20"]).astype(int)

	# Keep only the columns we need for merging
	mkt_out = m[["date","mkt_ret_1d","mkt_ret_5d",
	"mkt_vol_20d","mkt_above_ma20"]].copy()
	# ffill/bfill warmup NaNs
	for col in ["mkt_ret_1d","mkt_ret_5d","mkt_vol_20d","mkt_above_ma20"]:
	mkt_out[col] = mkt_out[col].ffill().bfill().fillna(0)

	_NIFTYBEES_CACHE["df"] = mkt_out
	return mkt_out
	except Exception as e:
	log.debug(f"NIFTYBEES load failed: {e}")
	_NIFTYBEES_CACHE["df"] = None
	return None


	# ── Cross-asset data cache (yfinance) ─────────────────────────────────────────
	_CROSS_ASSET_CACHE: dict = {}

	def _load_cross_asset(ticker: str, key: str) -> "pd.DataFrame \| None":
	"""Load cross-asset daily returns. Reads from parquet cache first, falls back to yfinance."""
	global _CROSS_ASSET_CACHE
	if key in _CROSS_ASSET_CACHE:
	return _CROSS_ASSET_CACHE[key]

	# Parquet name mapping
	parquet_map = {"nifty": "NIFTY50", "usdinr": "USDINR", "crude": "CRUDE"}
	pname = parquet_map.get(key)
	cross_path = BASE / f"data/lake/clean/market_data/CROSS/{pname}.parquet" if pname else None

	try:
	if cross_path and cross_path.exists():
	raw = pd.read_parquet(cross_path)
	raw["date"] = pd.to_datetime(raw["date"]).dt.tz_localize(None)
	result = raw[["date", "ret_1d", "ret_5d"]].copy()
	_CROSS_ASSET_CACHE[key] = result
	log.debug(f"Cross-asset {key}: loaded {len(result)} rows from parquet")
	return result

	# Fallback: yfinance
	import yfinance as yf
	raw = yf.download(ticker, period="max", progress=False, auto_adjust=True)
	if raw is None or raw.empty:
	_CROSS_ASSET_CACHE[key] = None
	return None
	if isinstance(raw.columns, pd.MultiIndex):
	raw.columns = [c[0].lower() for c in raw.columns]
	else:
	raw.columns = [c.lower() for c in raw.columns]
	raw = raw.reset_index()
	raw = raw.rename(columns={raw.columns[0]: "date"})
	raw["date"] = pd.to_datetime(raw["date"]).dt.tz_localize(None)
	raw["ret_1d"] = raw["close"].pct_change(1)
	raw["ret_5d"] = raw["close"].pct_change(5)
	result = raw[["date", "ret_1d", "ret_5d"]].copy()
	_CROSS_ASSET_CACHE[key] = result
	return result
	except Exception as e:
	log.debug(f"Cross-asset {ticker} failed: {e}")
	_CROSS_ASSET_CACHE[key] = None
	return None


	def _hurst_rs(prices: np.ndarray) -> float:
	"""Hurst exponent via R/S analysis. >0.5=trending, <0.5=mean-reverting."""
	try:
	n = len(prices)
	if n < 20:
	return 0.5
	mean_adj = prices - np.mean(prices)
	cs = np.cumsum(mean_adj)
	r = np.max(cs) - np.min(cs)
	s = np.std(prices, ddof=1)
	if s == 0:
	return 0.5
	return float(np.log(r / s) / np.log(n))
	except Exception:
	return 0.5

	def engineer_features(raw: pd.DataFrame) -> pd.DataFrame:
	"""
	Input : OHLCV DataFrame with [date, open, high, low, close, volume]
	Output: same rows with 87 feature columns + future_return + direction_class
	"""
	df = raw.copy()
	c = df["close"].astype(float)
	h = df["high"].astype(float)
	l = df["low"].astype(float)
	o = df["open"].astype(float)
	v = df["volume"].astype(float).replace(0, np.nan)

	# ── Returns ───────────────────────────────────────────────────────────────
	# ── Keltner Channels ─────────────────────────────────────────────────────
	keltner_mid = c.ewm(span=20, adjust=False).mean()
	_tr_k = pd.concat([h-l, (h-c.shift()).abs(), (l-c.shift()).abs()], axis=1).max(axis=1)
	atr14_k = _tr_k.ewm(span=14, adjust=False).mean()
	df["keltner_width"] = (keltner_mid + 2atr14_k - (keltner_mid - 2atr14_k)) / (keltner_mid + 1e-9)

	# ── Historical volatility (annualised) ────────────────────────────────────
	log_ret = np.log(c / (c.shift(1) + 1e-9))
	for w in [10, 20, 60]:
	df[f"hist_vol_{w}"] = log_ret.rolling(w).std() * np.sqrt(252)

	# ── Money Flow Index ─────────────────────────────────────────────────────
	typical = (h + l + c) / 3
	raw_mf = typical * v
	pos_mf = raw_mf.where(typical > typical.shift(1), 0).rolling(14).sum()
	neg_mf = raw_mf.where(typical < typical.shift(1), 0).rolling(14).sum()
	df["mfi_14"] = 100 - (100 / (1 + pos_mf / (neg_mf + 1e-9)))

	# ── Chaikin Money Flow ────────────────────────────────────────────────────
	clv = ((c - l) - (h - c)) / (h - l + 1e-9)
	df["cmf_20"] = (clv * v).rolling(20).sum() / (v.rolling(20).sum() + 1e-9)

	# ── VWAP distance ─────────────────────────────────────────────────────────
	typical_p = (h + l + c) / 3
	vwap = (typical_p * v).rolling(20).sum() / (v.rolling(20).sum() + 1e-9)
	df["vwap_dist"] = (c - vwap) / (vwap + 1e-9)

	# ── Amihud illiquidity ────────────────────────────────────────────────────
	df["amihud_illiquidity"] = (c.pct_change().abs() / (v * c + 1e-9)).rolling(20).mean()

	# ── Volume ratios ─────────────────────────────────────────────────────────
	df["vol_ratio_5"] = v / (v.rolling(5).mean() + 1e-9)
	df["vol_ratio_20"] = v / (v.rolling(20).mean() + 1e-9)

	# ── OBV divergence ────────────────────────────────────────────────────────
	obv_raw = (np.sign(c.diff()) * v).cumsum()
	obv_ema = obv_raw.ewm(span=20, adjust=False).mean()
	df["obv_divergence"] = obv_raw - obv_ema

	# ── Williams %R ───────────────────────────────────────────────────────────
	df["williams_r"] = -100 * (h.rolling(14).max() - c) / (h.rolling(14).max() - l.rolling(14).min() + 1e-9)

	# ── CCI ───────────────────────────────────────────────────────────────────
	df["cci_20"] = (c - c.rolling(20).mean()) / (0.015 * c.rolling(20).std() + 1e-9)

	# ── MACD cross up (computed after MACD section) ─────────────────────────
	if "macd" in df.columns and "macd_sig" in df.columns:
	df["macd_cross_up"] = (
	(df["macd"] > df["macd_sig"]) & (df["macd"].shift(1) <= df["macd_sig"].shift(1))
	).astype(int)
	else:
	df["macd_cross_up"] = 0

	# ── Stoch cross up ────────────────────────────────────────────────────────
	if "stoch_k_14" in df.columns and "stoch_d_14" in df.columns:
	df["stoch_cross_up"] = (
	(df["stoch_k_14"] > df["stoch_d_14"]) &
	(df["stoch_k_14"].shift(1) <= df["stoch_d_14"].shift(1))
	).astype(int)
	else:
	df["stoch_cross_up"] = 0

	# ── Candlestick patterns ─────────────────────────────────────────────────
	body = (c - o).abs()
	full_range = (h - l).replace(0, np.nan)
	upper_sh = (h - pd.concat([c, o], axis=1).max(axis=1)) / full_range
	lower_sh = (pd.concat([c, o], axis=1).min(axis=1) - l) / full_range
	df["gap_up"] = ((o - c.shift(1)) / (c.shift(1) + 1e-9)).clip(lower=0)
	df["gap_down"] = ((c.shift(1) - o) / (c.shift(1) + 1e-9)).clip(lower=0)
	df["inside_bar"] = ((h < h.shift(1)) & (l > l.shift(1))).astype(int)
	df["doji"] = (body / (full_range + 1e-9) < 0.1).astype(int)
	df["hammer"] = ((lower_sh > 2 * body / (full_range + 1e-9)) & (upper_sh < 0.3)).astype(int)
	df["shooting_star"] = ((upper_sh > 2 * body / (full_range + 1e-9)) & (lower_sh < 0.3)).astype(int)
	df["engulfing_bull"] = (
	(c > o) & (o < c.shift(1)) & (c > o.shift(1)) & (c.shift(1) < o.shift(1))
	).astype(int)
	df["engulfing_bear"] = (
	(c < o) & (o > c.shift(1)) & (c < o.shift(1)) & (c.shift(1) > o.shift(1))
	).astype(int)

	# ── Drawdown from recent high ─────────────────────────────────────────────
	df["dd_from_high_20"] = (c - c.rolling(20).max()) / (c.rolling(20).max() + 1e-9)
	df["dd_from_high_60"] = (c - c.rolling(60).max()) / (c.rolling(60).max() + 1e-9)

	# ── Rolling Sharpe ────────────────────────────────────────────────────────
	log_ret_s = np.log(c / (c.shift(1) + 1e-9))
	df["rolling_sharpe_20"] = (log_ret_s.rolling(20).mean() / (log_ret_s.rolling(20).std() + 1e-9)) * np.sqrt(252)
	df["rolling_sharpe_60"] = (log_ret_s.rolling(60).mean() / (log_ret_s.rolling(60).std() + 1e-9)) * np.sqrt(252)

	# ── Autocorrelation (trend persistence proxy, faster than Hurst) ──────────
	df["hurst_50"] = c.pct_change().rolling(20).apply(lambda x: x.autocorr(lag=1) if len(x)>5 else 0, raw=False)

	# ── Cross-asset macro features ────────────────────────────────────────────
	try:
	nifty_df = _load_cross_asset("^NSEI", "nifty")
	usdinr_df = _load_cross_asset("USDINR=X", "usdinr")
	crude_df = _load_cross_asset("CL=F", "crude")

	if "date" in df.columns:
	df["date"] = pd.to_datetime(df["date"])
	for cross, prefix in [(nifty_df,"nifty"), (usdinr_df,"usdinr"), (crude_df,"crude")]:
	if cross is not None:
	merged = df[["date"]].merge(cross, on="date", how="left")
	if prefix == "nifty":
	df["nifty_ret_1d"] = merged["ret_1d"].values
	df["nifty_ret_5d"] = merged["ret_5d"].values
	# Beta: cov(stock, nifty) / var(nifty)
	sr = c.pct_change()
	nr = pd.Series(merged["ret_1d"].values, index=df.index)
	df["beta_20d"] = sr.rolling(20).cov(nr) / (nr.rolling(20).var() + 1e-9)
	elif prefix == "usdinr":
	df["usd_inr_ret"] = merged["ret_1d"].values
	elif prefix == "crude":
	df["crude_ret"] = merged["ret_1d"].values
	except Exception as e:
	log.debug(f"Cross-asset features skipped: {e}")
	for col in ["nifty_ret_1d","nifty_ret_5d","beta_20d","usd_inr_ret","crude_ret"]:
	if col not in df.columns:
	df[col] = 0.0

	for p in [1, 3, 5, 10, 20]:
	df[f"return_{p}d"] = c.pct_change(p)

	# ── Volatility ────────────────────────────────────────────────────────────
	r1 = df["return_1d"]
	for w in [5, 10, 20]:
	df[f"volatility_{w}d"] = r1.rolling(w).std()
	df["vol_ratio"] = df["volatility_5d"] / (df["volatility_20d"] + 1e-9)
	df["skew_20"] = r1.rolling(20).skew()
	df["kurt_20"] = r1.rolling(20).kurt()

	# ── Moving averages ───────────────────────────────────────────────────────
	for w in [5, 10, 20, 50]:
	df[f"ma_{w}"] = c.rolling(w).mean()
	df[f"close_to_ma_{w}"] = (c - df[f"ma_{w}"]) / (df[f"ma_{w}"] + 1e-9)

	# ── EMA alignment ─────────────────────────────────────────────────────────
	ema5 = c.ewm(span=5, adjust=False).mean()
	ema20 = c.ewm(span=20, adjust=False).mean()
	ema50 = c.ewm(span=50, adjust=False).mean()
	df["ema_5_20_ratio"] = ema5 / (ema20 + 1e-9)
	df["ema_20_50_ratio"] = ema20 / (ema50 + 1e-9)
	df["ema_align"] = (ema5 > ema20).astype(int) + (ema20 > ema50).astype(int)

	# ── RSI (7, 14, 21) ───────────────────────────────────────────────────────
	delta = c.diff()
	gain = delta.clip(lower=0)
	loss = (-delta).clip(lower=0)
	for period in [7, 14, 21]:
	ag = gain.ewm(com=period - 1, min_periods=period).mean()
	al = loss.ewm(com=period - 1, min_periods=period).mean()
	df[f"rsi_{period}"] = 100 - 100 / (1 + ag / (al + 1e-9))

	# ── MACD ──────────────────────────────────────────────────────────────────
	ema12 = c.ewm(span=12, adjust=False).mean()
	ema26 = c.ewm(span=26, adjust=False).mean()
	df["macd"] = ema12 - ema26
	df["macd_sig"] = df["macd"].ewm(span=9, adjust=False).mean()
	df["macd_hist"] = df["macd"] - df["macd_sig"]
	df["macd_cross"] = (df["macd"] > df["macd_sig"]).astype(int)

	# ── Bollinger Bands ───────────────────────────────────────────────────────
	for w in [10, 20]:
	mid = c.rolling(w).mean()
	std = c.rolling(w).std()
	bb_u = mid + 2 * std
	bb_l = mid - 2 * std
	df[f"bb_width_{w}"] = (bb_u - bb_l) / (mid + 1e-9)
	df[f"bb_pct_{w}"] = (c - bb_l) / (bb_u - bb_l + 1e-9)

	# ── ATR ───────────────────────────────────────────────────────────────────
	tr = pd.concat(
	[h - l, (h - c.shift()).abs(), (l - c.shift()).abs()], axis=1
	).max(axis=1)
	for w in [7, 14]:
	atr = tr.ewm(span=w, adjust=False).mean()
	df[f"atr_{w}"] = atr
	df[f"atr_pct_{w}"] = atr / (c + 1e-9)

	# ── ADX / DI ──────────────────────────────────────────────────────────────
	plus_dm = (h - h.shift(1)).clip(lower=0)
	minus_dm = (l.shift(1) - l).clip(lower=0)
	plus_dm = plus_dm.where(plus_dm > minus_dm, 0)
	minus_dm = minus_dm.where(minus_dm > plus_dm, 0)
	atr14 = tr.ewm(span=14, adjust=False).mean() + 1e-9
	plus_di = 100 * plus_dm.ewm(span=14, adjust=False).mean() / atr14
	minus_di = 100 * minus_dm.ewm(span=14, adjust=False).mean() / atr14
	dx = 100 * (plus_di - minus_di).abs() / (plus_di + minus_di + 1e-9)
	df["adx_14"] = dx.ewm(span=14, adjust=False).mean()
	df["plus_di_14"] = plus_di
	df["minus_di_14"] = minus_di

	# ── Stochastics ───────────────────────────────────────────────────────────
	for k in [5, 14]:
	lo_k = l.rolling(k).min()
	hi_k = h.rolling(k).max()
	stoch_k = 100 * (c - lo_k) / (hi_k - lo_k + 1e-9)
	df[f"stoch_k_{k}"] = stoch_k
	df[f"stoch_d_{k}"] = stoch_k.rolling(3).mean()

	# ── Donchian channel position ──────────────────────────────────────────────
	for w in [10, 20]:
	dc_hi = h.rolling(w).max()
	dc_lo = l.rolling(w).min()
	df[f"donchian_{w}"] = (c - dc_lo) / (dc_hi - dc_lo + 1e-9)

	# ── Candle structure ──────────────────────────────────────────────────────
	rng = h - l + 1e-9
	df["candle_body"] = (c - o) / rng
	df["upper_shadow"] = (h - c.clip(lower=o)) / rng
	df["lower_shadow"] = (o.clip(upper=c) - l) / rng
	df["gap"] = (o - c.shift(1)) / (c.shift(1) + 1e-9)

	# ── Volume features ───────────────────────────────────────────────────────
	for w in [5, 20]:
	vmean = v.rolling(w).mean()
	vstd = v.rolling(w).std() + 1e-9
	df[f"volume_z_{w}"] = (v - vmean) / vstd
	obv = (np.sign(r1) * v).cumsum()
	df["obv_z"] = (obv - obv.rolling(20).mean()) / (obv.rolling(20).std() + 1e-9)
	df["vpt_5"] = (r1 * v).rolling(5).sum()

	# ── Momentum / ROC ────────────────────────────────────────────────────────
	for p in [3, 5, 10, 20]:
	df[f"roc_{p}"] = c.pct_change(p)
	df["momentum_score"] = (
	df["roc_3"].rank(pct=True) * 0.4
	+ df["roc_5"].rank(pct=True) * 0.3
	+ df["roc_10"].rank(pct=True) * 0.2
	+ df["roc_20"].rank(pct=True) * 0.1
	)


	# ── Market regime features (NIFTYBEES as Nifty50 proxy) ──────────────────
	# Cross-sectional context: is this stock outperforming the market?
	# Loaded once, merged by date, gracefully skipped if unavailable.
	try:
	mkt = _load_niftybees()
	if mkt is not None and "date" in df.columns:
	df["date"] = pd.to_datetime(df["date"])
	df = df.merge(mkt, on="date", how="left")
	# stock vs market relative strength
	df["stock_vs_mkt_5d"] = df["return_5d"] - df["mkt_ret_5d"].fillna(0)
	df["stock_vs_mkt_20d"] = df["return_20d"] - df["mkt_ret_5d"].fillna(0) * 4
	mkt_cols = ["mkt_ret_1d","mkt_ret_5d","mkt_vol_20d",
	"mkt_above_ma20","stock_vs_mkt_5d","stock_vs_mkt_20d"]
	df[mkt_cols] = df[mkt_cols].ffill().bfill().fillna(0)
	except Exception as e:
	log.debug(f"Market regime features skipped: {e}")

	# ── Target label (matches existing policy exactly) ───────────────────────
	fwd = c.shift(-HORIZON) / c - 1
	df["future_return"] = fwd
	df[TARGET] = 1 # neutral default
	df.loc[fwd >= POS_TH, TARGET] = 2 # up
	df.loc[fwd <= NEG_TH, TARGET] = 0 # down

	# Forward-fill warmup NaNs (first ~50 rows) so dropna doesn't slash feature count.
	# Only fills leading NaNs — does NOT touch mid-series NaNs which are real gaps.
	feat_cols_now = [col for col in df.columns
	if col not in {"date","open","high","low","close","volume",
	"future_return", TARGET}]
	df[feat_cols_now] = df[feat_cols_now].ffill().bfill()

	return df


	def feature_columns(df: pd.DataFrame) -> List[str]:
	drop = {"date","open","high","low","close","volume",
	"symbol","exchange","timestamp","future_return", TARGET}
	return [c for c in df.columns if c not in drop]


	# ─────────────────────────────────────────────────────────────────────────────
	# LAYER 1 — BASE MODELS
	# ─────────────────────────────────────────────────────────────────────────────

	def make_base_models(use_gpu: bool) -> dict:
	from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
	import xgboost as xgb

	gpu_xgb = {"device": "cuda", "tree_method": "hist"} if use_gpu else {}

	models = {
	"xgboost": xgb.XGBClassifier(
	n_estimators=300, max_depth=5, learning_rate=0.05,
	subsample=0.8, colsample_bytree=0.8,
	min_child_weight=3, gamma=0.1, reg_alpha=0.1, reg_lambda=1.0,
	objective="multi:softprob", num_class=3, eval_metric="mlogloss",
	random_state=SEED, verbosity=0, n_jobs=-1, **gpu_xgb,
	),
	"random_forest": RandomForestClassifier(
	n_estimators=200, max_depth=8, min_samples_leaf=5,
	max_features="sqrt", class_weight="balanced",
	random_state=SEED, n_jobs=-1,
	),
	"extra_trees": ExtraTreesClassifier(
	n_estimators=200, max_depth=10, min_samples_leaf=3,
	class_weight="balanced", random_state=SEED, n_jobs=-1,
	),
	}

	try:
	import lightgbm as lgb
	models["lightgbm"] = lgb.LGBMClassifier(
	n_estimators=300, num_leaves=63, learning_rate=0.05,
	subsample=0.8, colsample_bytree=0.8, min_child_samples=10,
	reg_alpha=0.1, reg_lambda=1.0,
	objective="multiclass", num_class=3,
	random_state=SEED, verbose=-1, n_jobs=-1,
	)
	log.info(" LightGBM ✓")
	except ImportError:
	log.warning(" LightGBM not installed (pip install lightgbm --break-system-packages)")

	try:
	from catboost import CatBoostClassifier
	cb_gpu = {"task_type": "GPU"} if use_gpu else {"task_type": "CPU"}
	models["catboost"] = CatBoostClassifier(
	iterations=300, depth=6, learning_rate=0.05, l2_leaf_reg=3,
	loss_function="MultiClass", classes_count=3,
	random_seed=SEED, verbose=False, **cb_gpu,
	)
	log.info(" CatBoost ✓")
	except ImportError:
	log.warning(" CatBoost not installed (pip install catboost --break-system-packages)")

	return models


	# ─────────────────────────────────────────────────────────────────────────────
	# LAYER 2 — PYTORCH META-LEARNER
	# ─────────────────────────────────────────────────────────────────────────────

	def _meta_net(in_dim: int):
	import torch.nn as nn
	return nn.Sequential(
	nn.Linear(in_dim, 64), nn.LayerNorm(64), nn.GELU(), nn.Dropout(0.2),
	nn.Linear(64, 32), nn.GELU(), nn.Dropout(0.1),
	nn.Linear(32, 3),
	)


	def train_meta_torch(net, X: np.ndarray, y: np.ndarray, device, epochs: int = 60):
	import torch, torch.nn as nn
	net = net.to(device)
	opt = torch.optim.AdamW(net.parameters(), lr=1e-3, weight_decay=1e-4)
	sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs)
	crit = nn.CrossEntropyLoss()
	Xt = torch.FloatTensor(X).to(device)
	yt = torch.LongTensor(y).to(device)
	net.train()
	for _ in range(epochs):
	opt.zero_grad()
	loss = crit(net(Xt), yt)
	loss.backward()
	torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)
	opt.step(); sched.step()
	net.eval()
	return net


	def predict_meta_torch(net, X: np.ndarray, device) -> np.ndarray:
	import torch
	with torch.no_grad():
	return torch.softmax(net(torch.FloatTensor(X).to(device)), dim=1).cpu().numpy()


	# ─────────────────────────────────────────────────────────────────────────────
	# GLOBAL MODEL — train on ALL symbols combined (~662K rows)
	# This is the right architecture when each symbol only has ~300 rows.
	# One model learns market-wide patterns; per-symbol fine-tuning adds specificity.
	# ─────────────────────────────────────────────────────────────────────────────

	def build_global_dataset(
	pairs: List[Tuple[str, Path]],
	max_symbols: Optional[int] = None,
	) -> Tuple[np.ndarray, np.ndarray, np.ndarray, List[str], List[str]]:
	"""
	Load and engineer features for all symbols, stack into one big matrix.
	Returns (X, y, symbol_ids, feature_cols, symbol_list)
	Uses the LAST 20% of each symbol's rows as test — everything else is train/val.
	Symbol IDs allow per-symbol evaluation after global training.
	"""
	all_X, all_y, all_syms, all_dates = [], [], [], []
	feature_cols_ref = None
	skipped = 0

	# Re-apply equity filter in case pairs came from unfiltered source
	pairs = [(s,p) for s,p in pairs if not any(
	x in s.upper() for x in [
	'LIQUID','GILT','GSEC','TBILL','CASH','GOLD','SILVER','METAL',
	'BBETF','EBBETF','BEES','NIFTY','LIQUIDADD','LIQUIDETF',
	'CASHIETF','AXISBPS','LICNETF',
	]
	) and not s.upper().endswith(('ETF','BEES','LIQUID','ADD','IETF'))]

	if max_symbols:
	pairs = pairs[:max_symbols]

	log.info(f" Loading {len(pairs)} equity symbols...")
	for i, (sym, _) in enumerate(pairs):
	if i % 200 == 0:
	log.info(f" {i}/{len(pairs)} loaded, {sum(len(x) for x in all_X):,} rows so far")
	try:
	raw = load_candles(sym)
	if raw is None:
	skipped += 1
	continue
	df = engineer_features(raw)
	fcols = feature_columns(df)
	df = df.dropna(subset=fcols + [TARGET])
	df = df[df[TARGET].isin([0, 1, 2])].reset_index(drop=True)
	if len(df) < MIN_ROWS:
	skipped += 1
	continue

	# Align feature columns across symbols
	if feature_cols_ref is None:
	feature_cols_ref = fcols
	else:
	fcols = feature_cols_ref # use first symbol's cols as reference

	# Only use columns that exist in this df
	available = [c for c in fcols if c in df.columns]
	X_sym = np.zeros((len(df), len(fcols)), dtype=np.float32)
	for j, fc in enumerate(fcols):
	if fc in df.columns:
	X_sym[:, j] = df[fc].values.astype(np.float32)

	all_X.append(X_sym)
	all_y.append(df[TARGET].values.astype(np.int32))
	all_syms.extend([sym] * len(df))
	if "date" in df.columns:
	all_dates.extend(df["date"].tolist())
	else:
	all_dates.extend([None] * len(df))

	except Exception as e:
	log.debug(f" [{sym}] skipped: {e}")
	skipped += 1

	log.info(f" Loaded: {len(pairs)-skipped} symbols, {skipped} skipped")

	if not all_X:
	raise ValueError("No data loaded — check CLEAN_DIR path")

	X = np.vstack(all_X)
	y = np.concatenate(all_y)
	sym_ids = np.array(all_syms)

	log.info(f" Combined dataset: {X.shape[0]:,} rows × {X.shape[1]} features")
	cls, cnts = np.unique(y, return_counts=True)
	for c, n in zip(cls, cnts):
	log.info(f" class {c}: {n:,} ({n/len(y)*100:.1f}%)")

	return X, y, sym_ids, feature_cols_ref or [], all_dates


	def temporal_global_split(
	sym_ids: np.ndarray,
	all_dates: List,
	test_ratio: float = 0.20,
	) -> Tuple[np.ndarray, np.ndarray]:
	"""
	For each symbol, put last test_ratio of its rows in test, rest in train.
	This prevents future leakage while keeping temporal structure.
	Returns (train_mask, test_mask).
	"""
	n = len(sym_ids)
	train_mask = np.ones(n, dtype=bool)
	test_mask = np.zeros(n, dtype=bool)

	symbols = np.unique(sym_ids)
	for sym in symbols:
	idx = np.where(sym_ids == sym)[0]
	cut = int(len(idx) * (1 - test_ratio))
	test_mask[idx[cut:]] = True
	train_mask[idx[cut:]] = False

	return train_mask, test_mask


	def train_global_model(use_gpu: bool = True, max_symbols: Optional[int] = None) -> dict:
	"""
	Train one ensemble on all ~662K rows combined.
	Saves models to data/lake/ml/p170/models/GLOBAL/
	Returns metrics dict.
	"""
	import joblib
	from sklearn.preprocessing import RobustScaler
	from sklearn.metrics import accuracy_score, classification_report

	log.info(f"\n{'#'*54}")
	log.info(f" GLOBAL MODEL TRAINING")
	log.info(f" ~662K rows across 2150 NSE symbols")
	log.info(f"{'#'*54}\n")
	t0 = time.time()

	# ── Load all data ─────────────────────────────────────────────────────────
	pairs = discover_symbols()
	X, y, sym_ids, fcols, all_dates = build_global_dataset(pairs, max_symbols=max_symbols)

	# ── Temporal split per symbol ─────────────────────────────────────────────
	train_mask, test_mask = temporal_global_split(sym_ids, all_dates, test_ratio=0.20)

	# Use 80% of training rows for train, 20% for val (within the train set)
	train_idx = np.where(train_mask)[0]
	val_cut = int(len(train_idx) * 0.80)
	val_idx = train_idx[val_cut:]
	tr_idx = train_idx[:val_cut]
	te_idx = np.where(test_mask)[0]

	X_tr, y_tr = X[tr_idx], y[tr_idx]
	X_val, y_val = X[val_idx], y[val_idx]
	X_te, y_te = X[te_idx], y[te_idx]

	log.info(f" Train: {len(X_tr):,} Val: {len(X_val):,} Test: {len(X_te):,}")

	# ── Scale ─────────────────────────────────────────────────────────────────
	scaler = RobustScaler()
	X_tr_s = scaler.fit_transform(X_tr)
	X_val_s = scaler.transform(X_val)
	X_te_s = scaler.transform(X_te)

	# ── Layer 1: base models ──────────────────────────────────────────────────
	base = make_base_models(use_gpu)
	nm = len(base)
	oof_val = np.zeros((len(X_val), nm * 3), dtype=np.float32)
	oof_te = np.zeros((len(X_te), nm * 3), dtype=np.float32)
	scores = {}

	for idx, (name, model) in enumerate(base.items()):
	log.info(f" [{name}] training on {len(X_tr):,} rows...")
	try:
	if name == "catboost":
	model.fit(X_tr_s, y_tr, eval_set=(X_val_s, y_val), verbose=100)
	elif name == "xgboost":
	model.fit(X_tr_s, y_tr, eval_set=[(X_val_s, y_val)],
	verbose=100)
	elif name == "lightgbm":
	model.fit(X_tr_s, y_tr, eval_set=[(X_val_s, y_val)])
	else:
	model.fit(X_tr_s, y_tr)

	vp = model.predict_proba(X_val_s)
	tp = model.predict_proba(X_te_s)
	oof_val[:, idx3:(idx+1)3] = vp
	oof_te[:, idx3:(idx+1)3] = tp

	acc = float(accuracy_score(y_val, np.argmax(vp, axis=1)))
	scores[name] = round(acc, 4)
	log.info(f" [{name}] val acc = {acc:.4f}")

	except Exception as e:
	log.error(f" [{name}] failed: {e}")
	log.debug(traceback.format_exc())
	oof_val[:, idx3:(idx+1)3] = 1/3
	oof_te[:, idx3:(idx+1)3] = 1/3

	# ── Layer 2: simple probability average ─────────────────────────────────
	# Stacking hurts here: val and test cover different time periods per symbol,
	# so weights learned on val are wrong for test. Simple average wins.
	log.info(f" [ensemble] averaging {len(base)} base models...")
	nm_actual = oof_te.shape[1] // 3
	ens_proba = np.zeros((len(oof_te), 3), dtype=np.float32)
	active = 0
	for mi in range(nm_actual):
	block = oof_te[:, mi3:(mi+1)3]
	if block.std() > 0.001:
	ens_proba += block
	active += 1
	ens_proba = ens_proba / max(active, 1)
	meta_type = "simple_average"
	meta_net_obj = None
	log.info(f" averaged {active} active base models")
	ens_pred = np.argmax(ens_proba, axis=1)
	ens_acc = float(accuracy_score(y_te, ens_pred))
	log.info(f" [ensemble] global test acc = {ens_acc:.4f}")
	log.info(f"\n{classification_report(y_te, ens_pred, target_names=['DOWN','NEUTRAL','UP'])}")

	# ── Per-symbol test accuracy ───────────────────────────────────────────────
	test_syms = sym_ids[te_idx]
	sym_accs = {}
	for sym in np.unique(test_syms):
	mask = test_syms == sym
	if mask.sum() >= 5:
	sym_accs[sym] = round(float(accuracy_score(y_te[mask], ens_pred[mask])), 4)

	# ── Feature importance ────────────────────────────────────────────────────
	top_features = []
	if "xgboost" in base:
	try:
	imp = base["xgboost"].feature_importances_
	top_idx = np.argsort(imp)[-15:][::-1]
	top_features = [[fcols[i], round(float(imp[i]), 4)] for i in top_idx]
	except Exception:
	pass

	# ── Save global model ─────────────────────────────────────────────────────
	gdir = MODELS_DIR / "GLOBAL"
	gdir.mkdir(exist_ok=True)
	joblib.dump(scaler, gdir / "scaler.joblib")
	joblib.dump(fcols, gdir / "feature_cols.joblib")
	for name, model in base.items():
	try:
	joblib.dump(model, gdir / f"{name}.joblib")
	log.info(f" saved {name}.joblib")
	except Exception as e:
	log.warning(f" save {name} failed: {e}")
	if meta_net_obj is not None:
	try:
	import torch
	torch.save(meta_net_obj.state_dict(), gdir / "meta_net.pt")
	with open(gdir / "meta_config.json", "w") as f:
	json.dump({"type": "neural_net", "input_dim": oof_val.shape[1]}, f)
	except Exception:
	pass

	# ── Save test predictions ─────────────────────────────────────────────────
	pd.DataFrame({
	"symbol": test_syms,
	"pred_class": ens_pred,
	"prob_down": ens_proba[:, 0],
	"prob_neutral": ens_proba[:, 1],
	"prob_up": ens_proba[:, 2],
	"actual_class": y_te,
	}).to_parquet(PREDS_DIR / "GLOBAL_p170_predictions.parquet", index=False)

	# ── Metrics ───────────────────────────────────────────────────────────────
	elapsed = round(time.time() - t0, 1)
	cls, cnts = np.unique(y_tr, return_counts=True)

	# top/bottom 10 symbols by accuracy
	sorted_syms = sorted(sym_accs.items(), key=lambda x: x[1], reverse=True)
	top10 = sorted_syms[:10]
	bot10 = sorted_syms[-10:]

	metrics = {
	"model": "GLOBAL",
	"status": "success",
	"timestamp": datetime.now().isoformat(),
	"n_symbols": int(len(np.unique(sym_ids))),
	"n_rows_total": int(len(X)),
	"n_train": int(len(X_tr)),
	"n_val": int(len(X_val)),
	"n_test": int(len(X_te)),
	"n_features": int(len(fcols)),
	"base_models": list(base.keys()),
	"meta_learner_type": meta_type,
	"base_val_accuracy": scores,
	"ensemble_test_accuracy": round(ens_acc, 4),
	"top_features": top_features[:10],
	"top10_symbols": top10,
	"bottom10_symbols": bot10,
	"elapsed_seconds": elapsed,
	"class_distribution_train": {str(int(k)): int(v) for k, v in zip(cls, cnts)},
	}
	with open(METRICS_DIR / "GLOBAL_p170_metrics.json", "w") as f:
	json.dump(metrics, f, indent=2)

	log.info(f"\n{'='*54}")
	log.info(f" GLOBAL MODEL DONE {elapsed:.0f}s")
	log.info(f" Test accuracy: {ens_acc:.4f} ({len(X_te):,} rows across {len(sym_accs)} symbols)")
	log.info(f" Top 5 symbols: {top10[:5]}")
	log.info(f"{'='*54}\n")
	return metrics


	# ─────────────────────────────────────────────────────────────────────────────
	# INFERENCE — predict today using global model
	# ─────────────────────────────────────────────────────────────────────────────

	def predict_today(symbol: str, use_gpu: bool = True) -> Optional[dict]:
	"""Load global model and predict on the latest row for a symbol."""
	import joblib
	gdir = MODELS_DIR / "GLOBAL"
	if not (gdir / "scaler.joblib").exists():
	log.error("No global model found. Run: python p170_max_system.py --global-model")
	return None

	raw = load_candles(symbol)
	if raw is None:
	return None

	df = engineer_features(raw)
	fcols = joblib.load(gdir / "feature_cols.joblib")
	scaler = joblib.load(gdir / "scaler.joblib")

	# Latest row with all features available
	last = df.iloc[-1]
	X = np.zeros((1, len(fcols)), dtype=np.float32)
	for i, fc in enumerate(fcols):
	if fc in last.index and not pd.isna(last[fc]):
	X[0, i] = float(last[fc])
	X_s = scaler.transform(X)

	# Collect base model probas
	base_probas = []
	for name in ["xgboost", "lightgbm", "catboost", "random_forest", "extra_trees"]:
	mp = gdir / f"{name}.joblib"
	if mp.exists():
	try:
	base_probas.append(joblib.load(mp).predict_proba(X_s)[0])
	except Exception:
	pass

	if not base_probas:
	return None

	meta_input = np.concatenate(base_probas).reshape(1, -1).astype(np.float32)
	cfg_path = gdir / "meta_config.json"
	try:
	import torch
	if cfg_path.exists():
	with open(cfg_path) as f:
	cfg = json.load(f)
	net = _meta_net(cfg["input_dim"])
	net.load_state_dict(torch.load(gdir / "meta_net.pt", map_location="cpu"))
	net.eval()
	device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
	proba = predict_meta_torch(net, meta_input, device)[0]
	else:
	proba = np.mean(base_probas, axis=0)
	except Exception:
	proba = np.mean(base_probas, axis=0)

	pred = int(np.argmax(proba))
	label = {0: "DOWN", 1: "NEUTRAL", 2: "UP"}[pred]
	return {
	"symbol": symbol, "date": str(last.get("date", "latest")),
	"prediction": label,
	"prob_down": round(float(proba[0]), 4),
	"prob_neutral": round(float(proba[1]), 4),
	"prob_up": round(float(proba[2]), 4),
	"confidence": round(float(proba.max()), 4),
	"rsi_14": round(float(last.get("rsi_14", np.nan)), 2),
	"macd_hist": round(float(last.get("macd_hist", np.nan)), 4),
	"adx_14": round(float(last.get("adx_14", np.nan)), 2),
	"ema_align": int(last.get("ema_align", -1)),
	}


	# ─────────────────────────────────────────────────────────────────────────────
	# LEGACY: per-symbol batch (kept for compatibility, but global model is better)
	# ─────────────────────────────────────────────────────────────────────────────

	def run_batch(pairs: List[Tuple[str, Path]], use_gpu: bool = True) -> dict:
	"""Thin wrapper — just calls train_global_model with all pairs."""
	return train_global_model(use_gpu=use_gpu, max_symbols=len(pairs))


	# ─────────────────────────────────────────────────────────────────────────────
	# DEPENDENCY CHECK
	# ─────────────────────────────────────────────────────────────────────────────

	def check_deps(use_gpu: bool):
	print("\n=== P170 Dependency Check ===\n")
	for name, mod in [("numpy","numpy"),("pandas","pandas"),
	("scikit-learn","sklearn"),("xgboost","xgboost"),("joblib","joblib")]:
	try: __import__(mod); print(f" ✓ {name}")
	except: print(f" ✗ {name} (pip install {name} --break-system-packages)")
	for name, mod in [("lightgbm","lightgbm"),("catboost","catboost"),("optuna","optuna")]:
	try: __import__(mod); print(f" ✓ {name} (optional)")
	except: print(f" ~ {name} (optional — pip install {name} --break-system-packages)")
	try:
	import torch
	cuda = torch.cuda.is_available()
	dev = torch.cuda.get_device_name(0) if cuda else "CPU only"
	print(f" ✓ pytorch ({dev})")
	if cuda:
	print(f" GPU memory: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB")
	except ImportError:
	print(" ~ pytorch (meta-learner falls back to LogisticRegression without it)")

	pairs = discover_symbols()
	print(f"\nPaths:")
	print(f" Clean data : {CLEAN_DIR} exists={CLEAN_DIR.exists()}")
	print(f" Features : {FEAT_DIR} exists={FEAT_DIR.exists()}")
	print(f" Output : {OUT}")
	print(f"\nSymbols found: {len(pairs)}")
	if pairs:
	print(" Largest 10 (most data first):")
	for sym, path in pairs[:10]:
	print(f" {sym:20s} {path.stat().st_size//1024:6d} KB {path.name}")
	print()


	# ─────────────────────────────────────────────────────────────────────────────
	# CLI
	# ─────────────────────────────────────────────────────────────────────────────

	def main():
	p = argparse.ArgumentParser(description="P170 MAX SYSTEM")
	g = p.add_mutually_exclusive_group(required=True)
	g.add_argument("--global-model", action="store_true", help="Train global model on all 2150 symbols combined (recommended)")
	g.add_argument("--all-symbols", action="store_true", help="Alias for --global-model")
	g.add_argument("--predict", type=str, help="Predict today for a symbol using global model")
	g.add_argument("--check-deps", action="store_true", help="Check dependencies and paths")
	p.add_argument("--top-n", type=int, default=None, help="Limit to top N symbols by data size (useful for quick tests)")
	p.add_argument("--no-gpu", action="store_true", help="Force CPU")
	args = p.parse_args()
	gpu = not args.no_gpu

	if args.check_deps:
	check_deps(gpu)
	elif args.global_model or args.all_symbols:
	m = train_global_model(use_gpu=gpu, max_symbols=args.top_n)
	print(json.dumps(m, indent=2, default=str))
	elif args.predict:
	r = predict_today(args.predict.upper(), use_gpu=gpu)
	print(json.dumps(r, indent=2) if r else f"No global model found. Run --global-model first.")


	if __name__ == "__main__":
	main()