#!/usr/bin/env python3
"""
Generate training dataset for AI Fusion strategy.
Fetches historical OHLCV, computes technical features, and labels data.
Includes future returns for Profit/Loss backtesting.
"""
import sys
import os
import pandas as pd
import numpy as np
import logging
import torch
from tqdm.auto import tqdm
# Add project root to path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from trading_cli.data.market import fetch_ohlcv_yfinance
from trading_cli.strategy.signals import (
calculate_rsi,
calculate_sma,
calculate_atr,
calculate_bollinger_bands
)
# Configure logging: INFO level via basicConfig, plus a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Default ticker universe used by build_dataset(). The rows appear to be grouped
# by sector, with crypto pairs and commodity futures on the last row —
# NOTE(review): grouping is inferred from the tickers, confirm if it matters.
SYMBOLS = [
"AAPL", "MSFT", "GOOGL", "AMZN", "TSLA", "NVDA", "AMD", "META", "NFLX", "ADBE",
"CRM", "INTC", "CSCO", "ORCL", "QCOM", "AVGO", "TXN", "AMAT", "MU", "LRCX",
"JPM", "BAC", "WFC", "GS", "MS", "V", "MA", "AXP", "BLK", "BX",
"XOM", "CVX", "COP", "SLB", "HAL", "MPC", "PSX", "VLO", "OXY", "HES",
"JNJ", "PFE", "UNH", "ABBV", "MRK", "LLY", "TMO", "DHR", "ISRG", "GILD",
"WMT", "COST", "HD", "LOW", "TGT", "PG", "KO", "PEP", "PM", "MO",
"CAT", "DE", "HON", "GE", "MMM", "UPS", "FDX", "RTX", "LMT", "GD",
"BTC-USD", "ETH-USD", "GC=F", "CL=F"
]
DAYS = 3652 # History length passed as `days=` to the fetcher (about 10 years of calendar days)
LOOKAHEAD = 5 # Number of subsequent closes examined when labelling a bar
TARGET_PCT = 0.02 # Upside threshold for the BUY label (2%)
STOP_PCT = 0.015 # Downside threshold for the SELL label (1.5%)
SEQ_LEN = 30 # Rows per input sequence. NOTE(review): originally described as "one month of trading days"; 30 bars exceeds a typical equity trading month — confirm intent
def generate_features(df):
    """Build the per-bar technical-indicator feature table for one symbol.

    The input frame may name its price/volume columns in lowercase
    (``close``/``volume``) or capitalised (``Close``/``Volume``).

    Returns a DataFrame on ``df``'s index with the columns ``rsi2``,
    ``rsi14``, ``dist_sma20``, ``dist_sma50``, ``dist_sma200``, ``bb_pos``,
    ``atr_pct`` and ``vol_ratio`` (in that order). Rows containing any
    missing value are removed before returning.
    """

    def pick(lower_name, title_name):
        # Prefer the lowercase column name; otherwise use the capitalised one.
        return df[lower_name if lower_name in df.columns else title_name]

    price = pick("close", "Close")
    columns = {}

    # RSI over a very short (2) and a standard (14) period, divided by 100.
    for period in (2, 14):
        columns[f"rsi{period}"] = calculate_rsi(price, period) / 100.0

    # Relative distance of price from its 20/50/200-bar simple moving averages.
    for window in (20, 50, 200):
        columns[f"dist_sma{window}"] = price / calculate_sma(price, window) - 1.0

    # Position of price inside the Bollinger band; the small epsilon keeps the
    # denominator non-zero when the band has no width.
    band_hi, _band_mid, band_lo = calculate_bollinger_bands(price, 20, 2.0)
    columns["bb_pos"] = (price - band_lo) / (band_hi - band_lo + 1e-6)

    # Volatility: ATR(14) expressed as a fraction of the close.
    columns["atr_pct"] = calculate_atr(df, 14) / price

    # Volume spike: volume over its 20-bar mean, capped at 5x and scaled to [0, 1].
    volume = pick("volume", "Volume")
    relative_volume = volume / volume.rolling(20).mean()
    columns["vol_ratio"] = relative_volume.clip(0, 5) / 5.0

    return pd.DataFrame(columns, index=df.index).dropna()
def generate_labels(df, lookahead=None, target_pct=None, stop_pct=None):
    """Label each bar with a first-touch (triple-barrier) outcome and its forward return.

    For every bar that has ``lookahead`` later closes available, the returns of
    those closes relative to the current close are scanned in chronological
    order. The label is decided by whichever barrier is touched first:

    * ``1`` (BUY)  - a close reaches ``+target_pct`` before any close reaches
      ``-stop_pct``;
    * ``2`` (SELL) - a close reaches ``-stop_pct`` first;
    * ``0`` (HOLD) - neither barrier is touched inside the window.

    Checking the window maximum before the window minimum would label a path
    that is stopped out first and only later reaches the target as BUY, which
    contradicts the barrier semantics; hence the ordered scan.

    Args:
        df: Frame with a ``close`` (or ``Close``) column.
        lookahead: Window length in bars; defaults to the module ``LOOKAHEAD``.
        target_pct: Upper barrier as a fraction; defaults to ``TARGET_PCT``.
        stop_pct: Lower barrier as a positive fraction; defaults to ``STOP_PCT``.

    Returns:
        ``(labels, future_rets)`` - two float arrays with one entry per row of
        ``df``. ``future_rets[i]`` is the return from close ``i`` to close
        ``i + lookahead``. The final ``lookahead`` rows have no complete window
        and keep the placeholder values ``0`` / ``0.0``.

    Raises:
        ValueError: If ``lookahead`` is smaller than 1.
    """
    lookahead = LOOKAHEAD if lookahead is None else lookahead
    target_pct = TARGET_PCT if target_pct is None else target_pct
    stop_pct = STOP_PCT if stop_pct is None else stop_pct
    if lookahead < 1:
        raise ValueError("lookahead must be at least 1")

    close = df["close" if "close" in df.columns else "Close"].values
    n_rows = len(close)
    labels = np.zeros(n_rows)
    future_rets = np.zeros(n_rows)

    for i in range(n_rows - lookahead):
        current_price = close[i]
        window_rets = (close[i + 1 : i + lookahead + 1] - current_price) / current_price

        # Indices (in time order) of closes that touch either barrier.
        touched = np.flatnonzero(
            (window_rets >= target_pct) | (window_rets <= -stop_pct)
        )
        if touched.size:
            first_touch = window_rets[touched[0]]
            labels[i] = 1 if first_touch >= target_pct else 2  # BUY / SELL
        # Otherwise the label stays 0 (HOLD).

        # Return realised at the end of the window, used for P/L backtesting.
        future_rets[i] = window_rets[-1]

    return labels, future_rets
def _window_samples(feat_vals, label_vals, ret_vals, seq_len):
    """Cut aligned per-bar arrays into overlapping windows of ``seq_len`` rows.

    Returns ``(X, y, rets)`` where ``X[k]`` is ``feat_vals[k : k + seq_len]``
    and ``y[k]`` / ``rets[k]`` belong to the last row of that window, or
    ``None`` when fewer than ``seq_len`` rows are available.
    """
    # "+ 1" so the final complete window (ending on the last row) is included.
    n_windows = len(feat_vals) - seq_len + 1
    if n_windows <= 0:
        return None
    windows = np.stack([feat_vals[k : k + seq_len] for k in range(n_windows)])
    last_rows = slice(seq_len - 1, seq_len - 1 + n_windows)
    return windows, label_vals[last_rows], ret_vals[last_rows]


def build_dataset(symbols=SYMBOLS, days=DAYS, output_path="data/trading_dataset.pt"):
    """Fetch, label, and sequence data for all symbols, then save it to disk.

    For each symbol the OHLCV history is fetched, technical features and
    labels are computed, a ``sentiment`` column is appended, and the rows are
    cut into overlapping windows of ``SEQ_LEN`` bars. Each window is paired
    with the label and forward return of its last bar. A symbol that fails or
    has too little history is skipped (failures are logged).

    Note: the ``sentiment`` column is sampled from a normal distribution
    (mean 0, std 0.2); it is not derived from any data source.

    Args:
        symbols: Iterable of ticker strings to process.
        days: History length forwarded to ``fetch_ohlcv_yfinance``.
        output_path: Destination file for ``torch.save``. Its parent directory
            is created when the path has one.

    Returns:
        A dict with tensors ``X`` (samples x ``SEQ_LEN`` x features, float32),
        ``y`` (long) and ``rets`` (float32) plus the requested ``symbols``,
        or ``None`` when no symbol produced any sample.
    """
    all_X, all_y, all_rets = [], [], []

    for symbol in tqdm(symbols, desc="Building Global Dataset"):
        try:
            df = fetch_ohlcv_yfinance(symbol, days=days)
            if len(df) < (SEQ_LEN + LOOKAHEAD + 50):
                continue

            features = generate_features(df)
            labels, rets = generate_labels(df)

            # Align features with labels/rets and add sentiment
            df_aligned = pd.DataFrame(index=df.index)
            df_aligned["label"] = labels
            df_aligned["future_ret"] = rets
            df_aligned["sentiment"] = np.random.normal(0, 0.2, len(df))

            # The last LOOKAHEAD rows have no complete future window, so their
            # label/return are placeholders (0), not real HOLD outcomes.
            # Drop them instead of training on fabricated samples.
            df_aligned = df_aligned.iloc[: len(df) - LOOKAHEAD]

            # Merge features
            df_combined = features.join(df_aligned, how="inner").dropna()
            if len(df_combined) < SEQ_LEN:
                continue

            feat_vals = df_combined.drop(columns=["label", "future_ret"]).values
            label_vals = df_combined["label"].values.astype(int)
            ret_vals = df_combined["future_ret"].values

            # Label/Ret is for the prediction point at the END of the sequence
            samples = _window_samples(feat_vals, label_vals, ret_vals, SEQ_LEN)
            if samples is not None:
                symbol_X, symbol_y, symbol_rets = samples
                all_X.append(symbol_X)
                all_y.append(symbol_y)
                all_rets.append(symbol_rets)
        except Exception as e:
            # Best-effort per symbol: one bad ticker must not abort the run.
            logger.error(f"Error processing {symbol}: {e}")

    if not all_X:
        logger.error("No valid data collected!")
        return None

    X = np.concatenate(all_X, axis=0)
    y = np.concatenate(all_y, axis=0)
    rets = np.concatenate(all_rets, axis=0)

    data = {
        "X": torch.tensor(X, dtype=torch.float32),
        "y": torch.tensor(y, dtype=torch.long),
        "rets": torch.tensor(rets, dtype=torch.float32),
        "symbols": symbols,
    }

    # dirname is "" for a bare filename, and os.makedirs("") raises.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    torch.save(data, output_path)
    logger.info(f"✅ Dataset saved to {output_path} | Shape: {X.shape}")
    return data
if __name__ == "__main__":
    # Script entry point: build the dataset with the default symbols, history
    # length, and output path.
    build_dataset()
|