File size: 10,719 Bytes
02eaa3c 7455898 02eaa3c 7455898 25f0954 d1dbf91 02eaa3c d1dbf91 5b7a80a 02eaa3c 5b7a80a d1dbf91 5b7a80a 02eaa3c 5b7a80a 25f0954 d1dbf91 02eaa3c d1dbf91 02eaa3c d1dbf91 25f0954 d1dbf91 7455898 d1dbf91 7455898 25f0954 7455898 25f0954 7455898 02eaa3c 7455898 02eaa3c 7455898 02eaa3c 7455898 02eaa3c d1dbf91 7455898 02eaa3c 5b7a80a 02eaa3c 5b7a80a 7455898 02eaa3c d1dbf91 5b7a80a 7455898 d1dbf91 7455898 d1dbf91 7455898 5b7a80a 7455898 5b7a80a d1dbf91 02eaa3c d1dbf91 02eaa3c 7455898 d1dbf91 02eaa3c 5b7a80a 02eaa3c 5b7a80a d1dbf91 02eaa3c d1dbf91 02eaa3c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 | """
data/loader.py
Loads master_data.parquet from HF Dataset.
Engineers rich feature set from raw price/macro columns.
No external pings — all data from HF Dataset only.
"""
import pandas as pd
import numpy as np
import streamlit as st
from huggingface_hub import hf_hub_download
from datetime import datetime, timedelta
import pytz
try:
import pandas_market_calendars as mcal
NYSE_CAL_AVAILABLE = True
except ImportError:
NYSE_CAL_AVAILABLE = False
DATASET_REPO = "P2SAMAPA/fi-etf-macro-signal-master-data"
PARQUET_FILE = "master_data.parquet"
TARGET_ETF_COLS = ["TLT", "VNQ", "SLV", "GLD", "LQD", "HYG", "VCIT"]
BENCHMARK_COLS = ["SPY", "AGG"]
TBILL_COL = "TBILL_3M"
MACRO_COLS = ["VIX", "DXY", "T10Y2Y", "IG_SPREAD", "HY_SPREAD"]
# ββ NYSE calendar βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def get_last_nyse_trading_day(as_of=None):
    """Return the most recent NYSE trading day as a ``datetime.date``.

    Uses the pandas_market_calendars NYSE schedule when the package is
    available; otherwise falls back to skipping weekend days (holidays are
    not accounted for in the fallback).

    Args:
        as_of: optional datetime to evaluate against; defaults to the
            current time in US/Eastern.
    """
    eastern = pytz.timezone("US/Eastern")
    reference = datetime.now(eastern) if as_of is None else as_of
    ref_date = reference.date()

    if NYSE_CAL_AVAILABLE:
        try:
            calendar = mcal.get_calendar("NYSE")
            schedule = calendar.schedule(
                start_date=ref_date - timedelta(days=10),
                end_date=ref_date,
            )
            if len(schedule) > 0:
                return schedule.index[-1].date()
        except Exception:
            # Calendar lookup failed; fall through to the weekday heuristic.
            pass

    # Fallback: walk backwards past Saturday (5) / Sunday (6).
    day = ref_date
    while day.weekday() >= 5:
        day -= timedelta(days=1)
    return day
# ββ Data loading ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
@st.cache_data(ttl=3600, show_spinner=False)
def load_dataset(hf_token: str) -> pd.DataFrame:
    """Download ``master_data.parquet`` from the HF dataset repo.

    Ensures the result is sorted by a DatetimeIndex (promoting a
    Date/date/DATE column to the index when needed). On any failure an
    error is shown in the Streamlit UI and an empty DataFrame is returned.
    Cached for one hour via ``st.cache_data``.
    """
    try:
        local_path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=PARQUET_FILE,
            repo_type="dataset",
            token=hf_token,
        )
        frame = pd.read_parquet(local_path)
        if not isinstance(frame.index, pd.DatetimeIndex):
            # Promote the first recognised date column to the index.
            date_col = next(
                (c for c in ("Date", "date", "DATE") if c in frame.columns),
                None,
            )
            if date_col is not None:
                frame = frame.set_index(date_col)
            frame.index = pd.to_datetime(frame.index)
        return frame.sort_index()
    except Exception as e:
        st.error(f"β Failed to load dataset: {e}")
        return pd.DataFrame()
# ββ Freshness check βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def check_data_freshness(df: pd.DataFrame) -> dict:
    """Compare the dataset's last index date to the last NYSE trading day.

    Args:
        df: master DataFrame indexed by DatetimeIndex (may be empty).

    Returns:
        dict with keys:
            fresh (bool)             - True when data covers the expected day
            last_date_in_data (date) - last date present in ``df``
            expected_date (date)     - last NYSE trading day
            message (str)            - user-facing status line
        (only ``fresh`` and ``message`` when ``df`` is empty)
    """
    if df.empty:
        return {"fresh": False, "message": "Dataset is empty."}
    last = df.index[-1].date()
    expect = get_last_nyse_trading_day()
    fresh = last >= expect
    # Bug fix: the original message was a single-quoted f-string broken
    # across physical lines (a syntax error); rebuilt as valid literals
    # with the garbled status glyphs restored.
    if fresh:
        msg = f"✅ Dataset up to date through {last}."
    else:
        msg = (
            f"⚠️ {expect} data not yet updated. Latest: {last}. "
            f"Dataset updates daily after market close."
        )
    return {
        "fresh": fresh,
        "last_date_in_data": last,
        "expected_date": expect,
        "message": msg,
    }
# ββ Price β returns βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def _to_returns(series: pd.Series) -> pd.Series:
"""Convert price series to daily pct returns. If already returns, pass through."""
clean = series.dropna()
if len(clean) == 0:
return series
if abs(clean.median()) > 2: # price series
return series.pct_change()
return series # already returns
# ββ Feature engineering βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def _engineer_features(df: pd.DataFrame, ret_cols: list) -> pd.DataFrame:
    """
    Build the feature matrix from raw macro + ETF return columns.

    Per ETF return column: 1/5/21-day lags, 5/21-day rolling volatility,
    5/21-day cumulative momentum. Per macro column present: rolling
    252-day z-score (min 63 obs), 5-day change, 1-day lag. Additionally,
    when the inputs exist: T-bill level and 5-day change, a VIX>25 regime
    flag and 5-day VIX change, a yield-curve inversion flag, the HY/IG
    credit-spread ratio, and the TLT-AGG 5-day momentum spread.
    """
    out = pd.DataFrame(index=df.index)

    # ETF-return derived features (column order preserved per input col).
    for name in ret_cols:
        series = df[name]
        for lag in (1, 5, 21):
            out[f"{name}_lag{lag}"] = series.shift(lag)
        for window in (5, 21):
            out[f"{name}_vol{window}"] = series.rolling(window).std()
        for window in (5, 21):
            out[f"{name}_mom{window}"] = series.rolling(window).sum()

    # Macro-column derived features.
    for name in MACRO_COLS:
        if name not in df.columns:
            continue
        series = df[name]
        mean_252 = series.rolling(252, min_periods=63).mean()
        std_252 = series.rolling(252, min_periods=63).std()
        # Small epsilon guards against division by a zero rolling std.
        out[f"{name}_z"] = (series - mean_252) / (std_252 + 1e-9)
        out[f"{name}_chg5"] = series.diff(5)
        out[f"{name}_lag1"] = series.shift(1)

    # Short-rate level and 5-day change.
    if TBILL_COL in df.columns:
        short_rate = df[TBILL_COL]
        out["TBILL_level"] = short_rate
        out["TBILL_chg5"] = short_rate.diff(5)

    # Derived cross-asset signals.
    if "TLT_Ret" in df.columns and "AGG_Ret" in df.columns:
        tlt_mom = df["TLT_Ret"].rolling(5).sum()
        agg_mom = df["AGG_Ret"].rolling(5).sum()
        out["TLT_AGG_spread_mom5"] = tlt_mom - agg_mom
    if "VIX" in df.columns:
        out["VIX_regime"] = (df["VIX"] > 25).astype(float)
        out["VIX_mom5"] = df["VIX"].diff(5)
    if "T10Y2Y" in df.columns:
        out["YC_inverted"] = (df["T10Y2Y"] < 0).astype(float)
    if "IG_SPREAD" in df.columns and "HY_SPREAD" in df.columns:
        out["credit_ratio"] = df["HY_SPREAD"] / (df["IG_SPREAD"] + 1e-9)

    return out
# ββ Main extraction function ββββββββββββββββββββββββββββββββββββββββββββββββββ
def get_features_and_targets(df: pd.DataFrame):
    """
    Build return columns for target ETFs and engineer the feature set.

    Args:
        df: raw master DataFrame (DatetimeIndex, ETF price/return and
            macro columns). The caller's frame is NOT modified.

    Returns:
        input_features : list[str] engineered feature column names
        target_etfs    : list[str] e.g. ["TLT_Ret", ...]
        tbill_rate     : float, latest 3M T-bill rate as a decimal
        df_out         : DataFrame with raw + engineered columns,
                         NaN warm-up rows dropped
        col_info       : dict of per-ETF conversion diagnostics

    Raises:
        ValueError: if any target ETF column is missing from ``df``.
    """
    missing = [c for c in TARGET_ETF_COLS if c not in df.columns]
    if missing:
        raise ValueError(
            f"Missing ETF columns: {missing}. "
            f"Found: {list(df.columns)}"
        )

    # Bug fix: work on a copy so the caller's DataFrame (the st.cache_data
    # object returned by load_dataset) is never mutated in place; in-place
    # column assignment would corrupt the shared cached frame.
    df = df.copy()

    col_info = {}

    # Build ETF return columns.
    target_etfs = []
    for col in TARGET_ETF_COLS:
        ret_col = f"{col}_Ret"
        df[ret_col] = _to_returns(df[col])
        # Same median>2 heuristic as _to_returns, recorded for diagnostics.
        med = abs(df[col].dropna().median())
        col_info[col] = (
            f"priceβpct_change (median={med:.2f})" if med > 2
            else f"used as-is (median={med:.4f})"
        )
        target_etfs.append(ret_col)

    # Build benchmark return columns (optional).
    for col in BENCHMARK_COLS:
        if col in df.columns:
            df[f"{col}_Ret"] = _to_returns(df[col])

    # Drop the NaN row introduced by the first pct_change observation.
    df = df.dropna(subset=target_etfs).copy()

    # Engineer features and merge them into the frame.
    feat_df = _engineer_features(df, target_etfs)
    for col in feat_df.columns:
        df[col] = feat_df[col].values

    # Drop warm-up rows made NaN by lags / rolling windows.
    feat_cols = list(feat_df.columns)
    df = df.dropna(subset=feat_cols).copy()

    # Latest T-bill rate, normalised to a decimal (e.g. 4.5 -> 0.045).
    tbill_rate = 0.045  # fallback when the column is missing or empty
    if TBILL_COL in df.columns:
        raw = df[TBILL_COL].dropna()
        if len(raw) > 0:
            v = float(raw.iloc[-1])
            tbill_rate = v / 100 if v > 1 else v

    # Input features = engineered columns minus raw/target/macro columns.
    exclude = set(
        TARGET_ETF_COLS + BENCHMARK_COLS + target_etfs +
        [f"{c}_Ret" for c in BENCHMARK_COLS] + [TBILL_COL] +
        list(MACRO_COLS)
    )
    input_features = [c for c in feat_cols if c not in exclude]

    return input_features, target_etfs, tbill_rate, df, col_info
# ββ Dataset summary βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def dataset_summary(df: pd.DataFrame) -> dict:
    """Return lightweight diagnostics about the loaded master DataFrame.

    An empty input yields an empty dict; otherwise row/column counts, the
    date span of the index, and which expected ETF/benchmark/macro columns
    are actually present.
    """
    if df.empty:
        return {}

    def present(names):
        # Columns from *names* that exist in df, preserving the given order.
        return [c for c in names if c in df.columns]

    date_fmt = "%Y-%m-%d"
    return {
        "rows": len(df),
        "columns": len(df.columns),
        "start_date": df.index[0].strftime(date_fmt),
        "end_date": df.index[-1].strftime(date_fmt),
        "etfs_found": present(TARGET_ETF_COLS),
        "benchmarks": present(BENCHMARK_COLS),
        "macro_found": present(MACRO_COLS),
        "tbill_found": TBILL_COL in df.columns,
        "all_cols": list(df.columns),
    }
|