amitke committed
Commit 09fbd2c · 1 Parent(s): f3bda49
Files changed (3)
  1. inference.py +18 -3
  2. testing.py +28 -4
  3. train.py +36 -7
inference.py CHANGED
@@ -46,8 +46,14 @@ def predict_next(symbol: str, n_days: int = 1):
     model, scaler, meta = _load_artifacts(symbol)
     seq_len = meta["seq_len"]
 
-    closes = _last_close_series(symbol, days=max(400, seq_len*5))
-    scaled = scaler.transform(closes)
+    closes = _last_close_series(symbol, days=max(400, seq_len*5 + 20))
+
+    # Compute log returns on the fly
+    # closes is [N, 1]
+    prices = closes.flatten()
+    returns = np.log(prices[1:] / prices[:-1]).reshape(-1, 1)
+
+    scaled = scaler.transform(returns)
 
     # seed window
     window = scaled[-seq_len:].reshape(1, seq_len, 1).astype(np.float32)
@@ -63,5 +69,14 @@ def predict_next(symbol: str, n_days: int = 1):
     window_t = torch.from_numpy(window.astype(np.float32))
 
     preds_scaled = np.array(preds_scaled, dtype=np.float32).reshape(-1, 1)
-    preds_unscaled = scaler.inverse_transform(preds_scaled).flatten().tolist()
+    preds_returns = scaler.inverse_transform(preds_scaled).flatten()
+
+    # Reconstruct prices from the last known close
+    last_close = prices[-1]
+    curr = last_close
+    preds_unscaled = []
+    for r in preds_returns:
+        curr = curr * np.exp(r)
+        preds_unscaled.append(float(curr))  # plain floats, matching the old .tolist() output
+
     return {"symbol": symbol.upper(), "days": n_days, "predictions": preds_unscaled, "seq_len": seq_len, "meta": meta}
testing.py CHANGED
@@ -26,9 +26,15 @@ def evaluate(symbol: str):
     end = datetime.utcnow().date()
     start = end - timedelta(days=5*365)
     df = yf.download(symbol, start=start.isoformat(), end=end.isoformat(), progress=False, auto_adjust=True)
-    data = df[["Close"]].dropna().values
+    data = df[["Close"]].dropna()
+
+    # Compute log returns
+    data['LogReturn'] = np.log(data['Close'] / data['Close'].shift(1))
+    data = data.dropna()
+
+    returns = data['LogReturn'].values.reshape(-1, 1)
 
-    scaled = scaler.transform(data)
+    scaled = scaler.transform(returns)
     split_idx = int(len(scaled) * 0.8)
     test_scaled = scaled[split_idx - seq_len:]  # include tail of train for continuity
 
@@ -42,8 +48,26 @@ def evaluate(symbol: str):
 
     X_t = torch.from_numpy(X)  # [N, T, 1]
     pred_scaled = model(X_t).numpy()
-    pred = scaler.inverse_transform(pred_scaled)
-    y_true = scaler.inverse_transform(y)
+
+    # Inverse-transform the scaled outputs back to log returns
+    pred_returns = scaler.inverse_transform(pred_scaled).flatten()
+
+    # Reconstruct prices. The k-th predicted return corresponds to
+    # data.index[split_idx + k], so the base price for the first prediction is
+    # the close in the original df immediately before data.index[split_idx].
+    y_true_prices = df['Close'].loc[data.index[split_idx:]].values
+
+    first_test_idx_pos = df.index.get_loc(data.index[split_idx])
+    base_price = df['Close'].iloc[first_test_idx_pos - 1]
+
+    reconstructed = []
+    curr = base_price
+    for r in pred_returns:
+        curr = curr * np.exp(r)
+        reconstructed.append(curr)
+
+    pred = np.array(reconstructed)
+    y_true = y_true_prices[:len(pred)]  # sync lengths
 
     rmse = math.sqrt(mean_squared_error(y_true, pred))
     mae = mean_absolute_error(y_true, pred)
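
Note: the index alignment is the subtle part of this hunk: returns live in data (which dropped its first row), while base prices live in df. A self-contained sketch with a hypothetical toy frame, checking that compounding the true test returns from the chosen base price recovers the true closes:

    import numpy as np
    import pandas as pd

    idx = pd.date_range("2024-01-01", periods=10, freq="D")  # hypothetical dates
    df = pd.DataFrame({"Close": np.linspace(100.0, 109.0, 10)}, index=idx)

    data = df[["Close"]].copy()
    data["LogReturn"] = np.log(data["Close"] / data["Close"].shift(1))
    data = data.dropna()  # drops the first row, so data.index is a subset of df.index

    split_idx = int(len(data) * 0.8)
    first_pos = df.index.get_loc(data.index[split_idx])
    base_price = df["Close"].iloc[first_pos - 1]

    # Compounding the *true* test returns from base_price must give the true closes
    rebuilt = base_price * np.exp(np.cumsum(data["LogReturn"].values[split_idx:]))
    assert np.allclose(rebuilt, df["Close"].loc[data.index[split_idx:]].values)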
train.py CHANGED
@@ -4,7 +4,7 @@ import numpy as np
 import pandas as pd
 import yfinance as yf
 
-from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import StandardScaler
 from sklearn.metrics import mean_absolute_error, mean_squared_error
 
 import torch
@@ -49,9 +49,16 @@ def train(symbol: str, seq_len: int = 60, epochs: int = 5, batch_size: int = 32,
 
     # --- data ---
     df = fetch_data(symbol, start, end)
-    data = df['Close'].values.reshape(-1, 1)
-
-    scaler = MinMaxScaler((0, 1))
+
+    # Calculate log returns: ln(P_t / P_{t-1}).
+    # This makes the series stationary and avoids scaling issues with absolute prices.
+    df['LogReturn'] = np.log(df['Close'] / df['Close'].shift(1))
+    df = df.dropna()
+
+    data = df['LogReturn'].values.reshape(-1, 1)
+
+    # Use StandardScaler for returns (centered around 0)
+    scaler = StandardScaler()
     scaled = scaler.fit_transform(data)
 
     split_idx = int(len(scaled) * 0.8)
@@ -107,9 +114,31 @@ def train(symbol: str, seq_len: int = 60, epochs: int = 5, batch_size: int = 32,
     model.eval()
     with torch.no_grad():
         X_t = torch.from_numpy(X_test_like_train).float().to(device)
-        preds_scaled = model(X_t).cpu().numpy()  # scaled space
-        preds = scaler.inverse_transform(preds_scaled)
-        y_true = scaler.inverse_transform(y_test_like_train)
+        preds_scaled = model(X_t).cpu().numpy()  # scaled log returns
+
+        # Inverse-transform to get actual log returns
+        preds_returns = scaler.inverse_transform(preds_scaled).flatten()
+
+        # Reconstruct prices. X_test_like_train is built from
+        # [train_scaled[-seq_len:], test_scaled], so it yields one prediction per
+        # element of test_scaled. The first target corresponds to test_scaled[0],
+        # i.e. the return at df row split_idx, whose base price is the close at
+        # split_idx - 1: Price[t] = Price[t-1] * exp(Return[t]).
+        base_price = df['Close'].iloc[split_idx - 1]
+
+        reconstructed_preds = []
+        curr = base_price
+        for r in preds_returns:
+            curr = curr * np.exp(r)
+            reconstructed_preds.append(curr)
+
+        # Compare against the actual test prices
+        actual_prices = df['Close'].iloc[split_idx:].values
+
+        # Truncate if lengths differ (rare with this logic, but safe)
+        min_len = min(len(reconstructed_preds), len(actual_prices))
+        preds = np.array(reconstructed_preds[:min_len])
+        y_true = actual_prices[:min_len]
 
     rmse = math.sqrt(mean_squared_error(y_true, preds))
     mae = mean_absolute_error(y_true, preds)
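
Note: a minimal round-trip sketch of the new preprocessing (log returns -> StandardScaler -> inverse transform -> price reconstruction), using hypothetical closes in place of df['Close']:

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    closes = np.array([100.0, 101.5, 100.8, 102.3, 103.0])  # hypothetical closes
    returns = np.log(closes[1:] / closes[:-1]).reshape(-1, 1)

    scaler = StandardScaler()
    scaled = scaler.fit_transform(returns)         # zero mean, unit variance

    round_trip = scaler.inverse_transform(scaled)  # back to raw log returns
    assert np.allclose(round_trip, returns)

    # Compounding the round-tripped returns from the first close recovers the series
    assert np.allclose(closes[0] * np.exp(np.cumsum(round_trip.ravel())), closes[1:])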