# scripts/train.py — StockPro ML backend with pytorch-forecasting TFT (commit 9334ec6)
"""
Weekly retraining script for the TFT stock predictor.
Run: python -m scripts.train
Trains on all IDX tickers fetched from IndoPremier.
Uses multi-horizon quantile loss (pinball loss) for 3 quantiles.
Saves best model to models/tft_stock.pt and uploads to HF Hub.
"""
import os
import sys
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from app.services.data_fetcher import IDX_TICKERS, fetch_ohlcv
from app.services.feature_engineer import (
build_features,
make_sequences,
SEQUENCE_LEN,
FORECAST_HORIZON,
N_FEATURES,
)
from app.services.concept_drift import (
extract_snapshots_from_series,
SNAPSHOT_DIM,
K_HISTORY,
)
from app.models.tft_predictor import StockTFT
from app.models.ddg_da import DriftPredictorMLP
# Checkpoint locations: repo-root /models, created eagerly at import time.
MODEL_DIR = os.path.join(os.path.dirname(__file__), "..", "models")
MODEL_PATH = os.path.join(MODEL_DIR, "tft_stock.pt")
os.makedirs(MODEL_DIR, exist_ok=True)
# TFT training hyperparameters.
EPOCHS = 50
BATCH_SIZE = 32
LR = 5e-4
# Train on GPU when available, otherwise CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def quantile_loss(
    preds: torch.Tensor,
    targets: torch.Tensor,
    quantiles: tuple[float, ...] = (0.1, 0.5, 0.9),
) -> torch.Tensor:
    """Average pinball (quantile) loss across the given quantile levels.

    Args:
        preds: (batch, horizon, n_quantiles) predicted quantile values;
            preds[..., i] corresponds to quantiles[i].
        targets: (batch, horizon) ground-truth values.
        quantiles: quantile levels matching preds' last axis. A tuple default
            replaces the original mutable-list default (shared-default pitfall).

    Returns:
        Scalar tensor: mean pinball loss over all quantiles.
    """
    total = torch.tensor(0.0, device=preds.device)
    for i, q in enumerate(quantiles):
        errors = targets - preds[:, :, i]
        # Pinball: penalize under-prediction by q, over-prediction by (1 - q).
        total += torch.where(errors >= 0, q * errors, (q - 1) * errors).mean()
    return total / len(quantiles)
def make_multihorizon_targets(close_norm: np.ndarray, horizon: int) -> np.ndarray:
    """Build multi-horizon targets: for step t, the next `horizon` values.

    Args:
        close_norm: 1-D array of (normalized) close values, length T.
        horizon: number of future steps per target row.

    Returns:
        (T - horizon, horizon) float32 array where row i equals
        close_norm[i + 1 : i + 1 + horizon]. Always rank-2: a too-short
        series yields shape (0, horizon) — the original returned shape (0,),
        which broke downstream shape assumptions.
    """
    n = len(close_norm) - horizon
    if n <= 0:
        return np.empty((0, horizon), dtype=np.float32)
    # Vectorized replacement for the per-row append loop: windows over the
    # series shifted by one give exactly close_norm[i+1 : i+1+horizon] per row.
    windows = np.lib.stride_tricks.sliding_window_view(close_norm[1:], horizon)
    return windows[:n].astype(np.float32)
def collect_training_data() -> tuple[np.ndarray, np.ndarray]:
    """Build the TFT training set from every IDX ticker.

    Returns:
        (X, y): stacked input windows (N, SEQUENCE_LEN, N_FEATURES) and
        multi-horizon targets (N, FORECAST_HORIZON).

    Raises:
        RuntimeError: if no ticker produced a single usable sequence.
    """
    xs: list[np.ndarray] = []
    ys: list[np.ndarray] = []
    print(f"Fetching data for {len(IDX_TICKERS)} IDX tickers from IndoPremier...")
    for i, ticker in enumerate(IDX_TICKERS):
        data = fetch_ohlcv(ticker, period="5y")
        if data is None:
            print(f" [{i+1}/{len(IDX_TICKERS)}] {ticker}: no data, skipping")
            continue
        feats = build_features(data["closes"], data["volumes"], data["timestamps"])
        # Normalized close is feature column 0; its future values are the targets.
        targets = make_multihorizon_targets(feats[:, 0], FORECAST_HORIZON)
        # Trailing rows lack a complete future window, so truncate features to match.
        X, y = make_sequences(feats[: len(targets)], targets, SEQUENCE_LEN)
        if len(X) == 0:
            print(f" [{i+1}/{len(IDX_TICKERS)}] {ticker}: too short, skipping")
            continue
        xs.append(X)
        ys.append(y)
        print(f" [{i+1}/{len(IDX_TICKERS)}] {ticker}: {len(X)} sequences")
    if not xs:
        raise RuntimeError("No training data collected")
    return np.concatenate(xs), np.concatenate(ys)
def train():
    """Train the TFT quantile forecaster on all IDX tickers.

    Shuffles the pooled sequences, holds out 10% for validation, trains with
    pinball loss + cosine LR decay, checkpoints the best-validation weights
    to MODEL_PATH, then attempts an HF Hub upload.
    """
    print(f"Training TFT on {DEVICE}")
    X, y = collect_training_data()
    print(f"Total sequences: {len(X)}, features: {N_FEATURES}, horizon: {FORECAST_HORIZON}")
    # Shuffle once up front so the 90/10 split is random across tickers.
    order = np.random.permutation(len(X))
    X, y = X[order], y[order]
    cut = int(len(X) * 0.9)
    X_train, y_train = X[:cut], y[:cut]
    X_val, y_val = X[cut:], y[cut:]
    train_dl = DataLoader(
        TensorDataset(torch.tensor(X_train), torch.tensor(y_train)),
        batch_size=BATCH_SIZE,
        shuffle=True,
    )
    val_dl = DataLoader(
        TensorDataset(torch.tensor(X_val), torch.tensor(y_val)),
        batch_size=BATCH_SIZE,
    )
    model = StockTFT(input_size=N_FEATURES, forecast_horizon=FORECAST_HORIZON).to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=EPOCHS)
    best_val_loss = float("inf")
    for epoch in range(1, EPOCHS + 1):
        # --- training pass ---
        model.train()
        running = 0.0
        for xb, yb in train_dl:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            opt.zero_grad()
            preds, _ = model(xb)  # (B, HORIZON, 3 quantiles)
            loss = quantile_loss(preds, yb)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            running += loss.item() * len(xb)
        train_loss = running / len(X_train)
        # --- validation pass ---
        model.eval()
        running = 0.0
        with torch.no_grad():
            for xb, yb in val_dl:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                preds, _ = model(xb)
                running += quantile_loss(preds, yb).item() * len(xb)
        val_loss = running / len(X_val)
        scheduler.step()
        print(f"Epoch {epoch:2d}/{EPOCHS} train={train_loss:.4f} val={val_loss:.4f}")
        # Keep only the best-validation checkpoint.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), MODEL_PATH)
            print(f" βœ“ Saved best model (val={val_loss:.4f})")
    print(f"\nTraining complete. Best val loss: {best_val_loss:.4f}")
    print(f"Model saved to {MODEL_PATH}")
    _upload_to_hub(MODEL_PATH)
def _upload_to_hub(model_path: str) -> None:
    """Push the trained TFT checkpoint to the configured HF model repo.

    Best-effort: missing configuration skips the upload; any hub error is
    caught and printed rather than failing the training run.
    """
    from app.config import MODEL_REPO, HF_TOKEN

    if not (MODEL_REPO and HF_TOKEN):
        print("HF_MODEL_REPO / HF_TOKEN not set β€” skipping HF Hub upload.")
        return
    try:
        from huggingface_hub import HfApi

        hub = HfApi(token=HF_TOKEN)
        # Idempotent: create the private repo if it doesn't exist yet.
        hub.create_repo(repo_id=MODEL_REPO, repo_type="model", exist_ok=True, private=True)
        hub.upload_file(
            path_or_fileobj=model_path,
            path_in_repo="tft_stock.pt",
            repo_id=MODEL_REPO,
            repo_type="model",
            commit_message="Weekly TFT retrain via GitHub Actions",
        )
        print(f"Model uploaded to HF Hub: {MODEL_REPO}/tft_stock.pt")
    except Exception as e:
        print(f"HF Hub upload failed: {e}")
# ── DDG-DA training ───────────────────────────────────────────────────────────
# Checkpoint path and hyperparameters for the drift-predictor MLP.
DDG_DA_PATH = os.path.join(MODEL_DIR, "ddg_da.pt")
DDG_DA_EPOCHS = 30
DDG_DA_BATCH = 256
DDG_DA_LR = 1e-3
def collect_drift_snapshots() -> list[np.ndarray]:
    """Fetch per-ticker history and distill it into drift snapshots.

    Returns:
        One (K_ticker, SNAPSHOT_DIM) array per ticker with at least
        K_HISTORY + 1 snapshots. Ticker boundaries stay separate so no
        training pair later straddles two instruments.
    """
    valid: list[np.ndarray] = []
    print(f"Collecting drift snapshots for {len(IDX_TICKERS)} tickers...")
    for i, ticker in enumerate(IDX_TICKERS):
        data = fetch_ohlcv(ticker, period="5y")
        if data is None:
            continue
        feats = build_features(data["closes"], data["volumes"], data["timestamps"])
        snaps = extract_snapshots_from_series(feats)
        # One (X, y) pair needs K_HISTORY inputs plus one target snapshot.
        if len(snaps) > K_HISTORY:
            valid.append(snaps)
        if (i + 1) % 100 == 0:
            print(f" Snapshots: {i+1}/{len(IDX_TICKERS)} tickers processed, {len(valid)} valid")
    print(f" Collected {len(valid)} tickers with sufficient snapshot history")
    return valid
def build_ddg_da_dataset(
    per_ticker_snapshots: list[np.ndarray],
) -> tuple[np.ndarray, np.ndarray]:
    """Slide a K_HISTORY window over each ticker's snapshots to form (X, y).

    The window never crosses a ticker boundary, so no pair mixes two
    instruments.

    Returns:
        X: (N, K_HISTORY * SNAPSHOT_DIM) flattened history windows.
        y: (N, SNAPSHOT_DIM) the snapshot immediately following each window.

    Raises:
        RuntimeError: if no ticker produced a single pair.
    """
    xs, ys = [], []
    for snaps in per_ticker_snapshots:
        for start in range(len(snaps) - K_HISTORY):
            stop = start + K_HISTORY
            xs.append(snaps[start:stop].reshape(-1))  # flattened K-step history
            ys.append(snaps[stop])                    # next snapshot is the target
    if not xs:
        raise RuntimeError("No DDG-DA training pairs collected")
    return np.array(xs, dtype=np.float32), np.array(ys, dtype=np.float32)
def train_ddg_da(per_ticker_snapshots: list[np.ndarray]) -> None:
    """Fit the DDG-DA drift-predictor MLP on snapshot-transition pairs.

    Uses MSE regression of the next snapshot from the flattened K-step
    history, a 90/10 random split, cosine LR decay, and best-validation
    checkpointing to DDG_DA_PATH, followed by an HF Hub upload attempt.
    """
    print(f"\n--- Training DDG-DA drift predictor (device={DEVICE}) ---")
    X, y = build_ddg_da_dataset(per_ticker_snapshots)
    print(f" DDG-DA training pairs: {len(X)}, snapshot_dim={SNAPSHOT_DIM}, k_history={K_HISTORY}")
    # Shuffle once, then hold out the final 10% as validation.
    order = np.random.permutation(len(X))
    X, y = X[order], y[order]
    cut = int(len(X) * 0.9)
    X_train, y_train = X[:cut], y[:cut]
    X_val, y_val = X[cut:], y[cut:]
    train_dl = DataLoader(
        TensorDataset(torch.tensor(X_train), torch.tensor(y_train)),
        batch_size=DDG_DA_BATCH,
        shuffle=True,
    )
    val_dl = DataLoader(
        TensorDataset(torch.tensor(X_val), torch.tensor(y_val)),
        batch_size=DDG_DA_BATCH,
    )
    mlp = DriftPredictorMLP(k_history=K_HISTORY, snapshot_dim=SNAPSHOT_DIM).to(DEVICE)
    opt = torch.optim.Adam(mlp.parameters(), lr=DDG_DA_LR)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=DDG_DA_EPOCHS)
    criterion = nn.MSELoss()
    best_val = float("inf")
    for epoch in range(1, DDG_DA_EPOCHS + 1):
        # --- training pass ---
        mlp.train()
        running = 0.0
        for xb, yb in train_dl:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            opt.zero_grad()
            loss = criterion(mlp(xb), yb)
            loss.backward()
            nn.utils.clip_grad_norm_(mlp.parameters(), 1.0)
            opt.step()
            running += loss.item() * len(xb)
        train_loss = running / len(X_train)
        # --- validation pass ---
        mlp.eval()
        running = 0.0
        with torch.no_grad():
            for xb, yb in val_dl:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                running += criterion(mlp(xb), yb).item() * len(xb)
        val_loss = running / len(X_val)
        scheduler.step()
        print(f" DDG-DA Epoch {epoch:2d}/{DDG_DA_EPOCHS} train={train_loss:.6f} val={val_loss:.6f}")
        # Keep only the best-validation checkpoint.
        if val_loss < best_val:
            best_val = val_loss
            torch.save(mlp.state_dict(), DDG_DA_PATH)
            print(f" βœ“ Saved best DDG-DA model (val={val_loss:.6f})")
    print(f"DDG-DA training complete. Saved to {DDG_DA_PATH}")
    _upload_ddg_da_to_hub(DDG_DA_PATH)
def _upload_ddg_da_to_hub(model_path: str) -> None:
    """Push the DDG-DA checkpoint to the configured HF model repo.

    Best-effort: missing configuration skips the upload; hub errors are
    printed, never raised. Assumes the repo already exists (the TFT upload
    creates it earlier in the same run).
    """
    from app.config import MODEL_REPO, HF_TOKEN

    if not (MODEL_REPO and HF_TOKEN):
        print("HF_MODEL_REPO / HF_TOKEN not set β€” skipping DDG-DA HF Hub upload.")
        return
    try:
        from huggingface_hub import HfApi

        hub = HfApi(token=HF_TOKEN)
        hub.upload_file(
            path_or_fileobj=model_path,
            path_in_repo="ddg_da.pt",
            repo_id=MODEL_REPO,
            repo_type="model",
            commit_message="Weekly DDG-DA retrain via GitHub Actions",
        )
        print(f"DDG-DA model uploaded to HF Hub: {MODEL_REPO}/ddg_da.pt")
    except Exception as e:
        print(f"DDG-DA HF Hub upload failed: {e}")
# ── Entry point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Weekly retrain: TFT forecaster first, then the DDG-DA drift predictor
    # (both fetch their own data; snapshots reuse the same ticker universe).
    train()
    # Train DDG-DA drift predictor after TFT
    snapshots = collect_drift_snapshots()
    train_ddg_da(snapshots)