"""Train a per-symbol LSTM on daily closing prices and persist artifacts.

Flow: download adjusted daily closes via yfinance, scale to [0, 1], build
sliding-window sequences, train ``StockLSTM`` on CPU, evaluate on the
held-out tail in the original price scale, then save model weights, the
fitted scaler, and JSON metadata under ``artifacts/<SYMBOL>/``.
"""

import json
import math
import os
import pickle
from datetime import datetime, timedelta, timezone

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import yfinance as yf
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset, random_split

from models import StockLSTM

# Force CPU-only execution.  NOTE(review): torch initializes CUDA lazily, so
# setting this after `import torch` appears to work as long as no CUDA call
# has run yet — confirm if GPU visibility ever matters here.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

ARTIFACTS_DIR = "artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)


def fetch_data(symbol: str, start: str = None, end: str = None) -> pd.DataFrame:
    """Download daily adjusted close prices for *symbol*.

    Args:
        symbol: Ticker understood by yfinance (e.g. ``"AAPL"``).
        start: ISO start date; defaults to ~5 years before today (UTC).
        end: ISO end date; defaults to today (UTC).

    Returns:
        DataFrame with a single ``Close`` column, NaN rows dropped.

    Raises:
        ValueError: if yfinance returns no rows for the symbol.
    """
    today = datetime.now(timezone.utc).date()
    if end is None:
        end = today.isoformat()
    if start is None:
        start = (today - timedelta(days=5 * 365)).isoformat()
    df = yf.download(symbol, start=start, end=end, progress=False, auto_adjust=True)
    if df.empty:
        raise ValueError(f"No data for symbol {symbol}")
    # Recent yfinance versions may return MultiIndex columns (field, ticker)
    # even for a single symbol; flatten so the ['Close'] selection works.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
    return df[['Close']].dropna()


def make_sequences(values: np.ndarray, seq_len: int):
    """Build sliding lookback windows over *values*.

    ``X[i] = values[i:i+seq_len]`` predicts ``y[i] = values[i+seq_len]``.

    Args:
        values: array of shape ``[T, 1]`` (scaled prices).
        seq_len: lookback window length.

    Returns:
        Tuple ``(X, y)`` with shapes ``[N, seq_len, 1]`` and ``[N, 1]``
        where ``N = T - seq_len``.

    Raises:
        ValueError: if *values* is too short to build a single window.
          (Previously this silently returned empty, mis-shaped arrays that
          crashed downstream with an opaque error.)
    """
    if len(values) <= seq_len:
        raise ValueError(
            f"need more than seq_len={seq_len} rows to build sequences, "
            f"got {len(values)}"
        )
    X, y = [], []
    for i in range(seq_len, len(values)):
        X.append(values[i - seq_len:i])
        y.append(values[i])
    return np.array(X), np.array(y)


def to_tensor_loader(X, y, batch_size=32):
    """Wrap numpy arrays in a float32 ``TensorDataset``.

    NOTE(review): despite the name, this returns a *dataset*, not a
    DataLoader, and ``batch_size`` is unused — both kept for backward
    compatibility with existing callers.
    """
    X_t = torch.from_numpy(X).float()
    y_t = torch.from_numpy(y).float()
    return TensorDataset(X_t, y_t)


def train(symbol: str, seq_len: int = 60, epochs: int = 5, batch_size: int = 32,
          start: str = None, end: str = None, lr: float = 1e-2):
    """Train an LSTM forecaster for *symbol* and persist the artifacts.

    Args:
        symbol: Ticker to train on.
        seq_len: Lookback window length in trading days.
        epochs: Number of full passes over the training set.
        batch_size: Mini-batch size for training and validation.
        start: Optional ISO start date for the download window.
        end: Optional ISO end date for the download window.
        lr: AdamW learning rate.

    Returns:
        Dict with ``rmse``, ``mae`` (original price scale), ``rows``
        downloaded, and the upper-cased ``symbol``.

    Raises:
        ValueError: if the downloaded data yields too few sequences for a
          train/validation split.

    Side effects:
        Writes ``model.pt``, ``scaler.pkl`` and ``meta.json`` under
        ``artifacts/<SYMBOL>/`` and prints per-epoch losses.
    """
    device = torch.device("cpu")

    # --- data ---
    df = fetch_data(symbol, start, end)
    data = df['Close'].values.reshape(-1, 1)
    scaler = MinMaxScaler((0, 1))
    scaled = scaler.fit_transform(data)

    # Chronological 80/20 split (no shuffling across the time boundary).
    split_idx = int(len(scaled) * 0.8)
    train_scaled, test_scaled = scaled[:split_idx], scaled[split_idx:]

    X_train, y_train = make_sequences(train_scaled, seq_len)
    # Prepend the last seq_len training rows so the first test target still
    # has a full lookback window (continuity at the split boundary).
    X_test, y_test = make_sequences(
        np.vstack([train_scaled[-seq_len:], test_scaled]), seq_len
    )

    # Train/val split on the training portion.
    full_train_ds = to_tensor_loader(X_train, y_train)
    val_size = max(1, int(0.1 * len(full_train_ds)))
    train_size = len(full_train_ds) - val_size
    # BUGFIX: with very short histories train_size could reach 0, producing
    # an empty train loader and a ZeroDivisionError below — fail loudly.
    if train_size < 1:
        raise ValueError(
            f"not enough training sequences ({len(full_train_ds)}) for a "
            f"train/val split; reduce seq_len or widen the date range"
        )
    train_ds, val_ds = random_split(full_train_ds, [train_size, val_size])
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    # --- model ---
    model = StockLSTM(input_dim=1, hidden_dim=64, num_layers=2, dropout=0.2).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.05)

    # --- training ---
    for ep in range(epochs):
        model.train()
        train_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch average is per-sample.
            train_loss += loss.item() * xb.size(0)
        train_loss /= len(train_loader.dataset)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                val_loss += criterion(model(xb), yb).item() * xb.size(0)
        val_loss /= len(val_loader.dataset)
        # BUGFIX: val_loss was computed every epoch but never used or
        # reported — surface both losses for monitoring.
        print(f"[{symbol.upper()}] epoch {ep + 1}/{epochs} "
              f"train_loss={train_loss:.6f} val_loss={val_loss:.6f}")

    # --- evaluation on the held-out tail, in the original price scale ---
    model.eval()
    with torch.no_grad():
        X_t = torch.from_numpy(X_test).float().to(device)
        # Assumes StockLSTM outputs shape [N, 1] — TODO confirm in models.py.
        preds_scaled = model(X_t).cpu().numpy()
    preds = scaler.inverse_transform(preds_scaled)
    y_true = scaler.inverse_transform(y_test)
    # Cast to plain float so json.dump below never chokes on numpy scalars.
    rmse = float(math.sqrt(mean_squared_error(y_true, preds)))
    mae = float(mean_absolute_error(y_true, preds))

    # --- save artifacts ---
    base = os.path.join(ARTIFACTS_DIR, symbol.upper())
    os.makedirs(base, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(base, "model.pt"))
    with open(os.path.join(base, "scaler.pkl"), "wb") as f:
        pickle.dump(scaler, f)
    with open(os.path.join(base, "meta.json"), "w") as f:
        json.dump({
            "symbol": symbol.upper(),
            "seq_len": seq_len,
            "epochs": epochs,
            "batch_size": batch_size,
            "train_size": split_idx,
            "timestamps": {
                "start": df.index.min().strftime("%Y-%m-%d"),
                "end": df.index.max().strftime("%Y-%m-%d"),
                "trained_at_utc": datetime.now(timezone.utc).isoformat(),
            },
            "metrics": {"rmse": rmse, "mae": mae},
        }, f, indent=2)

    return {"rmse": rmse, "mae": mae, "rows": len(df), "symbol": symbol.upper()}