# StockSenseSpace / train.py
# author: amitke — original, commit f3bda49
import json
import math
import os
import pickle
from datetime import datetime, timedelta, timezone

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import yfinance as yf
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import TensorDataset, DataLoader, random_split

from models import StockLSTM
# Hide all GPUs so training runs CPU-only.
# NOTE(review): this runs after `import torch`; it still takes effect because
# torch initializes CUDA lazily, but confirm nothing touches CUDA earlier.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# Root directory where per-symbol model weights, scalers and metadata are saved.
ARTIFACTS_DIR = "artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)
def fetch_data(symbol: str, start: str = None, end: str = None) -> pd.DataFrame:
    """Download daily close prices for *symbol* from Yahoo Finance.

    Parameters
    ----------
    symbol : ticker symbol understood by Yahoo Finance.
    start, end : ISO dates (YYYY-MM-DD). Defaults: the ~5 years ending today.

    Returns
    -------
    pd.DataFrame with a single 'Close' column (adjusted prices, NaNs dropped).

    Raises
    ------
    ValueError if Yahoo returns no rows (unknown or delisted symbol).
    """
    # datetime.utcnow() is deprecated (Python 3.12); use an aware UTC clock.
    today = datetime.now(timezone.utc).date()
    if end is None:
        end = today.isoformat()
    if start is None:
        start = (today - timedelta(days=5 * 365)).isoformat()
    df = yf.download(symbol, start=start, end=end, progress=False, auto_adjust=True)
    if df.empty:
        raise ValueError(f"No data for symbol {symbol}")
    return df[['Close']].dropna()
def make_sequences(values: np.ndarray, seq_len: int):
    """Build supervised (window -> next value) pairs from a 2-D series.

    Parameters
    ----------
    values : array of shape [L, F] (here F == 1, the scaled close price).
    seq_len : number of past time steps per input window.

    Returns
    -------
    (X, y) with X of shape [N, seq_len, F] and y of shape [N, F], where
    N = max(0, L - seq_len).

    Fix: the original returned 1-D empty arrays (shape ``(0,)``) when the
    series was shorter than ``seq_len + 1``, which breaks downstream code
    expecting 3-D/2-D arrays; correct empty shapes are now returned.
    """
    n_features = values.shape[1] if values.ndim > 1 else 1
    if len(values) <= seq_len:
        # Not enough history for even one window: keep dimensionality intact.
        return (np.empty((0, seq_len, n_features), dtype=values.dtype),
                np.empty((0, n_features), dtype=values.dtype))
    X = np.array([values[i - seq_len:i] for i in range(seq_len, len(values))])
    # Target for window ending at i-1 is values[i]; equivalent to a tail slice.
    y = values[seq_len:].copy()
    return X, y
def to_tensor_loader(X, y, batch_size=32):
    """Wrap numpy arrays *X* and *y* into a float32 ``TensorDataset``.

    NOTE(review): despite the name, this returns a TensorDataset, not a
    DataLoader, and ``batch_size`` is unused — callers construct their own
    DataLoaders. Signature kept unchanged for interface compatibility.
    """
    features = torch.from_numpy(X).float()
    targets = torch.from_numpy(y).float()
    return TensorDataset(features, targets)
def train(symbol: str, seq_len: int = 60, epochs: int = 5, batch_size: int = 32,
          start: str = None, end: str = None, lr: float = 1e-2):
    """Train a StockLSTM on *symbol*'s close prices and persist artifacts.

    Downloads ~5 years of daily closes, min-max scales them, trains on the
    first 80% (with a 10% validation split) and evaluates RMSE/MAE on the
    remaining 20% in the original price scale. Saves model weights, the
    fitted scaler and a metadata JSON under ``artifacts/<SYMBOL>/``.

    Parameters
    ----------
    symbol : ticker symbol; upper-cased for artifact paths.
    seq_len : look-back window length in trading days.
    epochs, batch_size, lr : standard training hyperparameters.
    start, end : optional ISO date bounds passed to :func:`fetch_data`.

    Returns
    -------
    dict with keys ``rmse``, ``mae``, ``rows`` and ``symbol``.

    Raises
    ------
    ValueError (from fetch_data) when no data exists for the symbol.
    """
    device = torch.device("cpu")  # CUDA is disabled at module import

    # --- data ---
    df = fetch_data(symbol, start, end)
    data = df['Close'].values.reshape(-1, 1)
    scaler = MinMaxScaler((0, 1))
    # NOTE(review): the scaler is fit on the FULL series, so the test tail
    # leaks its min/max into scaling — confirm this is acceptable.
    scaled = scaler.fit_transform(data)

    split_idx = int(len(scaled) * 0.8)
    train_scaled, test_scaled = scaled[:split_idx], scaled[split_idx:]
    X_train, y_train = make_sequences(train_scaled, seq_len)
    # Prepend the last seq_len train rows so the first test window is full
    # (continuity at the split boundary).
    X_test_like_train, y_test_like_train = make_sequences(
        np.vstack([train_scaled[-seq_len:], test_scaled]), seq_len
    )

    # Train/val split on the training portion (val is at least one sample).
    full_train_ds = to_tensor_loader(X_train, y_train)
    val_size = max(1, int(0.1 * len(full_train_ds)))
    train_size = len(full_train_ds) - val_size
    train_ds, val_ds = random_split(full_train_ds, [train_size, val_size])
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    # --- model ---
    model = StockLSTM(input_dim=1, hidden_dim=64, num_layers=2, dropout=0.2).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.05)

    # --- training ---
    model.train()
    for ep in range(epochs):
        train_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch average is per-sample.
            train_loss += loss.item() * xb.size(0)
        train_loss /= len(train_loader.dataset)

        # quick validation pass
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                pred = model(xb)
                val_loss += criterion(pred, yb).item() * xb.size(0)
        val_loss /= len(val_loader.dataset)
        model.train()
        # Fix: per-epoch losses were computed but never reported anywhere.
        print(f"[{symbol.upper()}] epoch {ep + 1}/{epochs} "
              f"train_loss={train_loss:.6f} val_loss={val_loss:.6f}")

    # --- evaluation on held-out tail in the original price scale ---
    model.eval()
    with torch.no_grad():
        X_t = torch.from_numpy(X_test_like_train).float().to(device)
        preds_scaled = model(X_t).cpu().numpy()  # still in [0, 1] scale
    preds = scaler.inverse_transform(preds_scaled)
    y_true = scaler.inverse_transform(y_test_like_train)
    rmse = math.sqrt(mean_squared_error(y_true, preds))
    mae = mean_absolute_error(y_true, preds)

    # --- save artifacts ---
    base = os.path.join(ARTIFACTS_DIR, symbol.upper())
    os.makedirs(base, exist_ok=True)
    model_path = os.path.join(base, "model.pt")
    scaler_path = os.path.join(base, "scaler.pkl")
    meta_path = os.path.join(base, "meta.json")
    torch.save(model.state_dict(), model_path)
    with open(scaler_path, "wb") as f:
        pickle.dump(scaler, f)
    with open(meta_path, "w") as f:
        json.dump({
            "symbol": symbol.upper(),
            "seq_len": seq_len,
            "epochs": epochs,
            "batch_size": batch_size,
            "train_size": split_idx,
            "timestamps": {
                "start": df.index.min().strftime("%Y-%m-%d"),
                "end": df.index.max().strftime("%Y-%m-%d"),
                # aware UTC clock; datetime.utcnow() is deprecated in 3.12
                "trained_at_utc": datetime.now(timezone.utc).isoformat(),
            },
            "metrics": {"rmse": rmse, "mae": mae},
        }, f, indent=2)

    return {"rmse": rmse, "mae": mae, "rows": len(df), "symbol": symbol.upper()}