fix: track app/models/ package, ignore only top-level /models/
- .gitignore +1 -1
- app/models/__init__.py +0 -0
- app/models/ddg_da.py +191 -0
- app/models/embeddings.py +114 -0
- app/models/tft_predictor.py +249 -0
.gitignore
CHANGED
@@ -2,7 +2,7 @@ venv/
 __pycache__/
 *.pyc
 .env
-models/
+/models/
 *.ckpt
 *.pt
 checkpoints/
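Note on the pattern change: a gitignore pattern without a leading slash, such as models/, matches a directory of that name at any depth, so it was also hiding the tracked app/models/ package. Anchoring it as /models/ restricts the ignore rule to the top-level models/ directory (local checkpoints), while app/models/ stays under version control.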
app/models/__init__.py
ADDED
(empty file)
app/models/ddg_da.py
ADDED
@@ -0,0 +1,191 @@
"""
DDG-DA: Data Distribution Generation for Predictable Concept Drift Adaptation.

Implements:
1. DriftPredictorMLP – small MLP that predicts the next distribution snapshot
2. DDGDAPredictor – orchestrator: drift detection + drift score reporting

Note: DDG-DA head fine-tuning is model-architecture-specific and is handled
separately. This module provides drift detection that works with any TFT backend.

Reference: "DDG-DA: Data Distribution Generation for Predictable Concept Drift
Adaptation" (Data-Centric AI workshop).
"""
from __future__ import annotations

import os
import numpy as np
import torch
import torch.nn as nn
from typing import Optional

from app.services.concept_drift import (
    SNAPSHOT_DIM,
    K_HISTORY,
    SNAPSHOT_WINDOW,
    DriftState,
    compute_snapshot,
    extract_snapshots_from_series,
    compute_drift_score,
)


# ── 1. Drift Predictor MLP ────────────────────────────────────────────────────

class DriftPredictorMLP(nn.Module):
    """
    Predicts the NEXT distribution snapshot from the last K snapshots.

    Input:  (batch, K * SNAPSHOT_DIM) = (batch, 8 * 44 = 352)
    Output: (batch, SNAPSHOT_DIM) = (batch, 44)

    ~51K parameters, ~200KB model file on disk.
    """

    def __init__(self, k_history: int = K_HISTORY, snapshot_dim: int = SNAPSHOT_DIM, hidden: int = 128):
        super().__init__()
        self.input_dim = k_history * snapshot_dim
        self.net = nn.Sequential(
            nn.Linear(self.input_dim, hidden),
            nn.ELU(),
            nn.Dropout(0.1),
            nn.Linear(hidden, snapshot_dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

    def predict_next(self, history_snapshots: np.ndarray) -> np.ndarray:
        """
        Predict next snapshot from (K, 44) history → (44,) prediction.
        Pads with zeros if fewer than K history snapshots are available.
        """
        k = history_snapshots.shape[0]
        required = self.input_dim // SNAPSHOT_DIM
        if k < required:
            padding = np.zeros((required - k, SNAPSHOT_DIM), dtype=np.float32)
            history_snapshots = np.concatenate([padding, history_snapshots], axis=0)
        x = torch.tensor(history_snapshots.flatten()[None], dtype=torch.float32)  # (1, 352)
        with torch.no_grad():
            out = self.net(x).squeeze(0).numpy()  # (44,)
        return out


# ── 2. DDG-DA Predictor (Orchestrator) ───────────────────────────────────────


class DDGDAPredictor:
    """
    Orchestrates concept drift detection.

    - Measures current feature distribution (snapshot)
    - Scores drift vs. historical reference
    - Reports drift_score and drift_detected for confidence adjustment
    """

    def __init__(
        self,
        model_path: str,
        k_history: int = K_HISTORY,
    ):
        self.mlp = DriftPredictorMLP(k_history=k_history)
        self.k_history = k_history
        self._load_mlp(model_path)

    def _load_mlp(self, model_path: str) -> None:
        if os.path.exists(model_path):
            try:
                state = torch.load(model_path, map_location="cpu", weights_only=True)
                self.mlp.load_state_dict(state)
                self.mlp.eval()
            except Exception as e:
                print(f"[ddg_da] Could not load MLP weights: {e}")

    # ── Public API ────────────────────────────────────────────────────────────

    def adapt(self, features: np.ndarray) -> DriftState:
        """
        Assess concept drift from the current feature distribution.

        Args:
            features: (T, N_FEATURES) feature matrix for the current symbol.
                      Should cover at least SNAPSHOT_WINDOW rows.
        Returns:
            DriftState with drift_score, drift_detected, and predicted_next_snapshot.
        """
        if len(features) < SNAPSHOT_WINDOW:
            return DriftState(
                snapshot=np.zeros(SNAPSHOT_DIM, dtype=np.float32),
                drift_score=0.0,
                drift_detected=False,
            )

        current_snap = compute_snapshot(features, window=SNAPSHOT_WINDOW)
        all_snaps = extract_snapshots_from_series(features, window=SNAPSHOT_WINDOW)
        if len(all_snaps) < 2:
            return DriftState(snapshot=current_snap, drift_score=0.0, drift_detected=False)

        drift_score, drift_detected = compute_drift_score(current_snap, all_snaps[:-1])

        history_for_mlp = all_snaps[-self.k_history:]
        predicted_next = self.mlp.predict_next(history_for_mlp)

        return DriftState(
            snapshot=current_snap,
            drift_score=drift_score,
            drift_detected=drift_detected,
            predicted_next_snapshot=predicted_next,
        )


# ── Module-level loader with caching ─────────────────────────────────────────

_ddg_da: Optional[DDGDAPredictor] = None
_ddg_da_path_cached: Optional[str] = None


def _maybe_download_ddg_da(model_path: str) -> bool:
    """Download ddg_da.pt from HF Hub if not present locally."""
    if os.path.exists(model_path):
        return True
    import app.config as cfg
    if not cfg.MODEL_REPO or not cfg.HF_TOKEN:
        return False
    try:
        from huggingface_hub import hf_hub_download
        local = hf_hub_download(
            repo_id=cfg.MODEL_REPO,
            filename="ddg_da.pt",
            token=cfg.HF_TOKEN,
            local_dir=os.path.dirname(model_path),
        )
        if local != model_path:
            import shutil
            shutil.copy2(local, model_path)
        return os.path.exists(model_path)
    except Exception as e:
        print(f"[ddg_da] Could not download from HF Hub: {e}")
        return False


def load_ddg_da(model_path: str) -> Optional[DDGDAPredictor]:
    """
    Load (and cache) the DDG-DA drift predictor.
    Returns None gracefully if ddg_da.pt is absent – base TFT still works.
    """
    global _ddg_da, _ddg_da_path_cached
    if _ddg_da is not None and _ddg_da_path_cached == model_path:
        return _ddg_da

    _maybe_download_ddg_da(model_path)
    if not os.path.exists(model_path):
        return None

    try:
        predictor = DDGDAPredictor(model_path=model_path)
        _ddg_da = predictor
        _ddg_da_path_cached = model_path
        return _ddg_da
    except Exception as e:
        print(f"[ddg_da] Failed to initialize DDGDAPredictor: {e}")
        return None
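Usage note: a minimal sketch of how the loader and drift check are expected to be called. The model path, the feature-matrix shape, and the random data are illustrative assumptions, not part of this commit.

# Hypothetical usage sketch; path and feature data are placeholders
import numpy as np
from app.models.ddg_da import load_ddg_da

predictor = load_ddg_da("models/ddg_da.pt")              # returns None if weights are unavailable
features = np.random.randn(120, 11).astype(np.float32)   # stand-in for engineered features, shape (T, N_FEATURES)
if predictor is not None:
    state = predictor.adapt(features)
    print(state.drift_score, state.drift_detected)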
app/models/embeddings.py
ADDED
@@ -0,0 +1,114 @@
"""
Stock similarity via 16-dimensional feature embeddings.
Stored in Supabase as JSONB float arrays (no pgvector required).
Cosine similarity computed here for the /similar endpoint.
"""
import numpy as np
from typing import Optional
import pandas as pd


EMBED_DIM = 16


def _safe_norm(v: np.ndarray) -> float:
    return float(np.std(v)) or 1.0


def compute_embedding(
    closes: np.ndarray,
    volumes: np.ndarray,
    sector_id: int = 0,
) -> np.ndarray:
    """
    16-dim embedding per stock:
      [0]     sector (normalised 0-1)
      [1]     annualised return (capped ±100%)
      [2]     annualised volatility
      [3-6]   return autocorrelations, lags 1-4
      [7]     beta proxy (corr with own 20-day trend, approx)
      [8]     volume trend (avg recent 20 vs. avg older 20)
      [9]     max drawdown
      [10]    skewness of returns
      [11]    kurtosis of returns
      [12-15] return quantiles (q25, q50, q75, q90), scaled by vol
    """
    vec = np.zeros(EMBED_DIM, dtype=np.float32)

    if len(closes) < 30:
        return vec

    ret = np.diff(np.log(closes + 1e-9))
    T = len(ret)

    # [0] sector normalised 0-1 (max ~12 sectors on IDX)
    vec[0] = min(sector_id / 12.0, 1.0)

    # [1] annualised return, capped
    ann_ret = np.mean(ret) * 252
    vec[1] = float(np.clip(ann_ret, -1.0, 1.0))

    # [2] annualised volatility
    vec[2] = float(np.std(ret) * np.sqrt(252))

    # [3-6] autocorrelations, lags 1-4
    for lag in range(1, 5):
        if T > lag + 1:
            corr = float(np.corrcoef(ret[:-lag], ret[lag:])[0, 1])
            vec[2 + lag] = corr if not np.isnan(corr) else 0.0

    # [7] beta proxy: correlation with its own 20-day rolling avg (smoother = lower beta)
    s = pd.Series(closes)
    trend = s.rolling(20, min_periods=5).mean().dropna().values
    if len(trend) > 10:
        corr = float(np.corrcoef(closes[-len(trend):], trend)[0, 1])
        vec[7] = corr if not np.isnan(corr) else 0.0

    # [8] volume trend: mean of recent 20 vs. older 20
    if len(volumes) >= 40:
        recent = float(np.mean(volumes[-20:]))
        older = float(np.mean(volumes[-40:-20])) or 1.0
        vec[8] = float(np.clip((recent / older) - 1, -1, 1))

    # [9] max drawdown
    cum = np.cumprod(1 + ret)
    running_max = np.maximum.accumulate(cum)
    drawdowns = (cum - running_max) / (running_max + 1e-9)
    vec[9] = float(np.min(drawdowns))

    # [10] skewness
    mu, sigma = np.mean(ret), np.std(ret) or 1
    vec[10] = float(np.clip(np.mean(((ret - mu) / sigma) ** 3), -3, 3))

    # [11] excess kurtosis
    vec[11] = float(np.clip(np.mean(((ret - mu) / sigma) ** 4) - 3, -3, 3))

    # [12-15] return quantiles normalised by vol
    if sigma > 0:
        qs = np.quantile(ret, [0.25, 0.5, 0.75, 0.90])
        vec[12:16] = np.clip(qs / sigma, -3, 3).astype(np.float32)

    return vec


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    denom = (np.linalg.norm(a) * np.linalg.norm(b)) + 1e-9
    return float(np.dot(a, b) / denom)


def find_similar(
    target_embedding: list[float],
    all_embeddings: dict[str, list[float]],
    top_n: int = 6,
    exclude_symbol: Optional[str] = None,
) -> list[dict]:
    """Return the top_n most similar stocks by cosine similarity."""
    target = np.array(target_embedding, dtype=np.float32)
    scores = []
    for symbol, emb in all_embeddings.items():
        if exclude_symbol and symbol.upper() == exclude_symbol.upper():
            continue
        sim = cosine_similarity(target, np.array(emb, dtype=np.float32))
        scores.append({"symbol": symbol, "similarity": round(sim, 4)})
    scores.sort(key=lambda x: x["similarity"], reverse=True)
    return scores[:top_n]
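Usage note: a small end-to-end sketch of compute_embedding and find_similar with synthetic prices and made-up symbols; illustrative only, since real embeddings are stored in and fetched from Supabase.

# Illustrative only: synthetic data, hypothetical symbols
import numpy as np
from app.models.embeddings import compute_embedding, find_similar

rng = np.random.default_rng(0)
closes = 1000.0 * np.cumprod(1 + rng.normal(0, 0.01, 250))   # synthetic daily closes
volumes = rng.integers(1_000, 100_000, 250).astype(float)    # synthetic daily volumes

emb = compute_embedding(closes, volumes, sector_id=3)             # (16,) float32 vector
universe = {"AAAA": emb.tolist(), "BBBB": (emb * 0.9).tolist()}   # normally loaded from Supabase
print(find_similar(emb.tolist(), universe, top_n=1, exclude_symbol="AAAA"))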
app/models/tft_predictor.py
ADDED
@@ -0,0 +1,249 @@
"""
pytorch-forecasting TFT inference for IDX stock price prediction.

Loads from Lightning checkpoint (.ckpt) produced by train_colab.py.
Uses pytorch-forecasting's TimeSeriesDataSet + TemporalFusionTransformer.
"""
import os
import numpy as np
import pandas as pd
import torch
from datetime import datetime, timedelta
from typing import Optional

from app.services.feature_engineer import SEQUENCE_LEN, FEATURE_COLS, build_features

FORECAST_HORIZON = 30
ENCODER_LENGTH = SEQUENCE_LEN  # 60
QUANTILES = [0.1, 0.5, 0.9]
N_QUANTILES = len(QUANTILES)

TARGET = "close_norm"
KNOWN_REALS = ["day_sin", "day_cos", "month_sin", "month_cos"]
UNKNOWN_REALS = ["close_norm", "volume_norm", "rsi", "macd_norm", "bb_width", "atr_norm", "obv_norm"]

# Column index lookup for build_features() output
_FEAT_IDX = {col: i for i, col in enumerate(FEATURE_COLS)}

# ── Model / params caching ────────────────────────────────────────────────────

_model = None
_model_path_cached: Optional[str] = None
_ds_params: Optional[dict] = None
_ds_params_path_cached: Optional[str] = None


def _maybe_download(filename: str, local_path: str) -> bool:
    """Download a file from HF Hub if not present locally."""
    if os.path.exists(local_path):
        return True
    import app.config as cfg
    if not cfg.MODEL_REPO or not cfg.HF_TOKEN:
        return False
    try:
        from huggingface_hub import hf_hub_download
        local = hf_hub_download(
            repo_id=cfg.MODEL_REPO,
            filename=filename,
            token=cfg.HF_TOKEN,
            local_dir=os.path.dirname(local_path),
        )
        if local != local_path:
            import shutil
            shutil.copy2(local, local_path)
        return os.path.exists(local_path)
    except Exception as e:
        print(f"[tft] Could not download {filename} from HF Hub: {e}")
        return False


def load_model(model_path: str):
    """Load and cache the pytorch-forecasting TFT from a Lightning checkpoint."""
    global _model, _model_path_cached
    if _model is not None and _model_path_cached == model_path:
        return _model

    _maybe_download("tft_stock.ckpt", model_path)
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model checkpoint not found: {model_path}")

    from pytorch_forecasting import TemporalFusionTransformer
    model = TemporalFusionTransformer.load_from_checkpoint(model_path, map_location="cpu")
    model.eval()
    _model = model
    _model_path_cached = model_path
    print(f"[tft] Loaded pytorch-forecasting TFT from {model_path}")
    return model


def load_dataset_params(params_path: str) -> dict:
    """Load and cache the TimeSeriesDataSet parameters saved during Colab training."""
    global _ds_params, _ds_params_path_cached
    if _ds_params is not None and _ds_params_path_cached == params_path:
        return _ds_params

    _maybe_download("dataset_params.pt", params_path)
    if not os.path.exists(params_path):
        raise FileNotFoundError(f"Dataset params not found: {params_path}")

    params = torch.load(params_path, map_location="cpu", weights_only=False)
    _ds_params = params
    _ds_params_path_cached = params_path
    print(f"[tft] Loaded dataset params from {params_path}")
    return params


# ── Inference DataFrame builder ───────────────────────────────────────────────

def _build_inference_df(
    closes: np.ndarray,
    volumes: np.ndarray,
    timestamps: np.ndarray,
    symbol: str,
) -> pd.DataFrame:
    """
    Build a DataFrame with ENCODER_LENGTH encoder rows + FORECAST_HORIZON future rows.
    The encoder rows contain real feature values; future rows have only known reals
    (day/month cyclicals) – the decoder does not use unknown future reals.
    """
    features = build_features(closes, volumes, timestamps)  # (T, 11)
    if len(features) < ENCODER_LENGTH:
        raise ValueError(f"Need at least {ENCODER_LENGTH} candles, got {len(features)}")

    features = features[-ENCODER_LENGTH:]
    ts_slice = timestamps[-len(features):]

    # Timestamps → Python datetimes
    dates = [datetime.utcfromtimestamp(int(ts)) for ts in ts_slice]

    # Build encoder rows
    rows = []
    for i, (feat_row, dt) in enumerate(zip(features, dates)):
        row: dict = {
            "ticker": symbol,
            "time_idx": i,
            "date": dt,
        }
        for col in UNKNOWN_REALS + KNOWN_REALS:
            row[col] = float(feat_row[_FEAT_IDX[col]])
        rows.append(row)

    encoder_df = pd.DataFrame(rows)

    # Build future decoder rows (known reals computed from calendar)
    last_date = dates[-1]
    future_rows = []
    for i in range(1, FORECAST_HORIZON + 1):
        future_date = last_date + timedelta(days=i)
        future_rows.append({
            "ticker": symbol,
            "time_idx": ENCODER_LENGTH + i - 1,
            "date": future_date,
            # Unknown reals: placeholder values (not used in decoder future steps)
            TARGET: 0.0,
            "volume_norm": 0.0,
            "rsi": 0.5,
            "macd_norm": 0.0,
            "bb_width": 0.0,
            "atr_norm": 0.0,
            "obv_norm": 0.0,
            # Known reals: actual calendar features
            "day_sin": float(np.sin(2 * np.pi * future_date.weekday() / 5)),
            "day_cos": float(np.cos(2 * np.pi * future_date.weekday() / 5)),
            "month_sin": float(np.sin(2 * np.pi * future_date.month / 12)),
            "month_cos": float(np.cos(2 * np.pi * future_date.month / 12)),
        })

    return pd.concat([encoder_df, pd.DataFrame(future_rows)], ignore_index=True)


# ── Inference ─────────────────────────────────────────────────────────────────

def predict_quantiles(
    closes: np.ndarray,
    volumes: np.ndarray,
    timestamps: np.ndarray,
    days: int,
    model_path: str,
    dataset_params_path: Optional[str] = None,
    symbol: str = "UNKNOWN",
) -> dict:
    """
    Run pytorch-forecasting TFT inference for a `days` forecast horizon.
    Returns quantile predictions as price levels (denormalized).
    """
    if dataset_params_path is None:
        dataset_params_path = model_path.replace("tft_stock.ckpt", "dataset_params.pt")

    model = load_model(model_path)
    ds_params = load_dataset_params(dataset_params_path)

    days = max(1, min(days, FORECAST_HORIZON))
    current_price = float(closes[-1])
    roll_mean = float(np.mean(closes[-30:]))
    roll_std = float(np.std(closes[-30:])) or 1.0

    # Build inference DataFrame
    full_df = _build_inference_df(closes, volumes, timestamps, symbol)

    # Reconstruct TimeSeriesDataSet from training-time parameters.
    # from_parameters() reuses the fitted categorical encoder (ticker → int),
    # so unknown tickers fall back to the UNK embedding gracefully.
    from pytorch_forecasting import TimeSeriesDataSet

    pred_ds = TimeSeriesDataSet.from_parameters(
        ds_params,
        full_df,
        predict=True,  # one sample per group, from the end of data
        stop_randomization=True,
        min_encoder_length=ENCODER_LENGTH,
        max_encoder_length=ENCODER_LENGTH,
        min_prediction_length=FORECAST_HORIZON,
        max_prediction_length=FORECAST_HORIZON,
        min_prediction_idx=None,
    )
    pred_dl = pred_ds.to_dataloader(train=False, batch_size=1, num_workers=0)

    # Predict → returns tensor of shape (1, FORECAST_HORIZON, N_QUANTILES)
    with torch.no_grad():
        raw = model.predict(pred_dl, mode="quantiles", return_x=False)

    # Handle both tensor and list returns
    if isinstance(raw, torch.Tensor):
        preds = raw.squeeze(0).cpu().numpy()  # (FORECAST_HORIZON, 3)
    else:
        preds = np.array(raw[0])  # (FORECAST_HORIZON, 3)

    preds = preds[:days]  # slice to requested horizon

    # Denormalize rolling z-score → price levels
    q10 = [max(0.0, round(float(z * roll_std + roll_mean), 2)) for z in preds[:, 0]]
    q50 = [max(0.0, round(float(z * roll_std + roll_mean), 2)) for z in preds[:, 1]]
    q90 = [max(0.0, round(float(z * roll_std + roll_mean), 2)) for z in preds[:, 2]]

    # Enforce monotonic bounds (q10 ≤ q50 ≤ q90)
    for i in range(days):
        q10[i] = min(q10[i], q50[i])
        q90[i] = max(q90[i], q50[i])

    final_price = q50[-1]
    trend = (
        "bullish" if final_price > current_price * 1.005
        else "bearish" if final_price < current_price * 0.995
        else "sideways"
    )
    change_pct = (final_price - current_price) / current_price * 100

    return {
        "method": "tft",
        "predictions": q50,
        "lower_bound": q10,
        "upper_bound": q90,
        "target_price": final_price,
        "trend": trend,
        "change_pct": round(change_pct, 2),
        "confidence": 72,
        "support": round(min(q10), 2),
        "resistance": round(max(q90), 2),
        "feature_importance": {},  # TFT attention weights available via interpret_output() if needed
    }
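Usage note: a hedged call sketch for predict_quantiles. The synthetic OHLCV arrays, the local model path, and the example ticker are assumptions; the call only succeeds when tft_stock.ckpt and dataset_params.pt exist locally or can be fetched from the configured HF Hub repo.

# Illustrative call; real closes/volumes/timestamps come from the app's data layer
import numpy as np
from app.models.tft_predictor import predict_quantiles

rng = np.random.default_rng(0)
closes = 4500.0 * np.cumprod(1 + rng.normal(0, 0.01, 120))      # >= ENCODER_LENGTH (60) daily closes
volumes = rng.integers(1_000_000, 5_000_000, 120).astype(float)
timestamps = 1_600_000_000 + np.arange(120) * 86_400            # daily unix timestamps (seconds)

result = predict_quantiles(
    closes, volumes, timestamps,
    days=14,
    model_path="models/tft_stock.ckpt",   # hypothetical local path
    symbol="BBCA",                        # example IDX ticker
)
print(result["trend"], result["target_price"], result["change_pct"])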