Spaces:
Sleeping
Sleeping
File size: 7,379 Bytes
514c4c0 09fbd2c 514c4c0 d56edb9 514c4c0 1e90053 95aa7b7 514c4c0 09fbd2c 514c4c0 f3bda49 5143bce 514c4c0 09fbd2c 514c4c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
import json
import math
import os
import pickle
from datetime import datetime, timedelta, timezone
from typing import Optional

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import yfinance as yf
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader, random_split

from models import StockLSTM
# Hide all GPUs so torch runs CPU-only everywhere in this module.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# Root directory where per-symbol model/scaler/metadata artifacts are written.
ARTIFACTS_DIR = "artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)
def fetch_data(symbol: str, start: Optional[str] = None, end: Optional[str] = None) -> pd.DataFrame:
    """Download daily adjusted close prices for ``symbol`` via yfinance.

    Args:
        symbol: Ticker symbol understood by Yahoo Finance.
        start: ISO date string (inclusive). Defaults to ~5 years before today.
        end: ISO date string (exclusive). Defaults to today (UTC).

    Returns:
        DataFrame with a single ``Close`` column, NaN rows dropped.

    Raises:
        ValueError: If yfinance returns no rows for the symbol.
    """
    # datetime.utcnow() is deprecated; use an aware UTC now instead.
    today_utc = datetime.now(timezone.utc).date()
    if end is None:
        end = today_utc.isoformat()
    if start is None:
        start = (today_utc - timedelta(days=5 * 365)).isoformat()
    # auto_adjust=True folds splits/dividends into Close.
    df = yf.download(symbol, start=start, end=end, progress=False, auto_adjust=True)
    if df.empty:
        raise ValueError(f"No data for symbol {symbol}")
    return df[['Close']].dropna()
def make_sequences(values: np.ndarray, seq_len: int):
    """Build sliding-window inputs and next-step targets.

    Each input is the ``seq_len`` rows preceding a target row, so the
    model learns to predict row ``t`` from rows ``t-seq_len .. t-1``.

    Args:
        values: Array of shape ``(n, features)``.
        seq_len: Window length.

    Returns:
        Tuple ``(X, y)`` with shapes ``(n - seq_len, seq_len, features)``
        and ``(n - seq_len, features)``.
    """
    targets = range(seq_len, len(values))
    windows = np.array([values[t - seq_len:t] for t in targets])
    labels = np.array([values[t] for t in targets])
    return windows, labels
def to_tensor_loader(X, y, batch_size=32):
    """Wrap numpy arrays as a float32 ``TensorDataset``.

    NOTE(review): despite the name, this returns a ``TensorDataset`` — not
    a ``DataLoader``. Callers split it and build their own loaders, so the
    return type must stay a dataset. ``batch_size`` is accepted only for
    backward compatibility and is intentionally unused.

    Args:
        X: Input windows, any numpy dtype castable to float32.
        y: Targets aligned with ``X``.
        batch_size: Ignored (kept for API compatibility).

    Returns:
        ``TensorDataset`` of float32 tensors ``(X, y)``.
    """
    X_t = torch.from_numpy(X).float()
    y_t = torch.from_numpy(y).float()
    return TensorDataset(X_t, y_t)
def train(symbol: str, seq_len: int = 60, epochs: int = 5, batch_size: int = 32,
          start: Optional[str] = None, end: Optional[str] = None, lr: float = 1e-2):
    """Train a StockLSTM on standardized log-returns and save artifacts.

    Pipeline: download closes -> log-returns -> StandardScaler ->
    chronological 80/20 split -> sliding windows -> train with a 10%
    validation holdout -> evaluate on the test tail by reconstructing
    prices from predicted returns -> write ``model.pt``, ``scaler.pkl``
    and ``meta.json`` under ``artifacts/<SYMBOL>/``.

    Args:
        symbol: Ticker symbol to train on.
        seq_len: Window length fed to the LSTM.
        epochs: Training epochs.
        batch_size: Mini-batch size.
        start, end: Optional ISO date bounds passed to ``fetch_data``.
        lr: AdamW learning rate.

    Returns:
        Dict with price-unit ``rmse``/``mae``, row count, and symbol.
    """
    device = torch.device("cpu")  # CPU-only; GPUs hidden at module import

    # --- data ---
    df = fetch_data(symbol, start, end)
    # Log-returns ln(P_t / P_{t-1}) make the series stationary and
    # scale-free, unlike raw prices.
    df['LogReturn'] = np.log(df['Close'] / df['Close'].shift(1))
    df = df.dropna()
    data = df['LogReturn'].values.reshape(-1, 1)

    # StandardScaler suits returns, which are roughly centered at zero.
    # (Uses the module-level import; the old duplicate local import is gone.)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(data)

    # Chronological 80/20 split — no shuffling across the boundary.
    split_idx = int(len(scaled) * 0.8)
    train_scaled, test_scaled = scaled[:split_idx], scaled[split_idx:]
    X_train, y_train = make_sequences(train_scaled, seq_len)
    # Prepend the last seq_len training points so every test target gets a
    # full input window; predictions then cover ALL of test_scaled.
    X_test_like_train, y_test_like_train = make_sequences(
        np.vstack([train_scaled[-seq_len:], test_scaled]), seq_len
    )

    # Train/val split on the training portion (val is a random 10%).
    full_train_ds = to_tensor_loader(X_train, y_train)
    val_size = max(1, int(0.1 * len(full_train_ds)))
    train_size = len(full_train_ds) - val_size
    train_ds, val_ds = random_split(full_train_ds, [train_size, val_size])
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    # --- model ---
    model = StockLSTM(input_dim=1, hidden_dim=64, num_layers=2, dropout=0.2).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.05)

    # --- training ---
    model.train()
    for ep in range(epochs):
        train_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * xb.size(0)
        train_loss /= len(train_loader.dataset)

        # Quick validation pass (no gradients, eval mode for dropout).
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                pred = model(xb)
                val_loss += criterion(pred, yb).item() * xb.size(0)
        val_loss /= len(val_loader.dataset)
        # Previously these losses were computed and silently discarded.
        print(f"epoch {ep + 1}/{epochs} train_loss={train_loss:.6f} val_loss={val_loss:.6f}")
        model.train()

    # --- evaluation on the held-out tail, in price units ---
    model.eval()
    with torch.no_grad():
        X_t = torch.from_numpy(X_test_like_train).float().to(device)
        preds_scaled = model(X_t).cpu().numpy()  # standardized log-returns

    # Undo the scaler to recover actual log-returns.
    preds_returns = scaler.inverse_transform(preds_scaled).flatten()

    # Reconstruct prices: Price[t] = Price[t-1] * exp(Return[t]).
    # The first prediction targets test_scaled[0] (index split_idx in the
    # return series), so the base is the close just before it.
    base_price = df['Close'].iloc[split_idx - 1]
    reconstructed_preds = []
    curr = base_price
    for r in preds_returns:
        curr = curr * np.exp(r)
        reconstructed_preds.append(curr)

    actual_prices = df['Close'].iloc[split_idx:].values
    # Truncate defensively in case of an off-by-one between the two series.
    min_len = min(len(reconstructed_preds), len(actual_prices))
    preds = np.array(reconstructed_preds[:min_len])
    y_true = actual_prices[:min_len]
    rmse = math.sqrt(mean_squared_error(y_true, preds))
    mae = mean_absolute_error(y_true, preds)

    # --- save artifacts ---
    base = os.path.join(ARTIFACTS_DIR, symbol.upper())
    os.makedirs(base, exist_ok=True)
    model_path = os.path.join(base, "model.pt")
    scaler_path = os.path.join(base, "scaler.pkl")
    meta_path = os.path.join(base, "meta.json")
    torch.save(model.state_dict(), model_path)
    with open(scaler_path, "wb") as f:
        pickle.dump(scaler, f)
    with open(meta_path, "w") as f:
        json.dump({
            "symbol": symbol.upper(),
            "seq_len": seq_len,
            "epochs": epochs,
            "batch_size": batch_size,
            "train_size": split_idx,
            "timestamps": {
                "start": df.index.min().strftime("%Y-%m-%d"),
                "end": df.index.max().strftime("%Y-%m-%d"),
                # Aware UTC timestamp (datetime.utcnow() is deprecated).
                "trained_at_utc": datetime.now(timezone.utc).isoformat()
            },
            "metrics": {"rmse": rmse, "mae": mae}
        }, f, indent=2)
    return {"rmse": rmse, "mae": mae, "rows": len(df), "symbol": symbol.upper()}
|