diff --git a/app.py b/app.py index bc6e5582806fd23c779112cf110ba46101a0e27d..8145c1839d6d05bd22a71a19079075f3734c3513 100644 --- a/app.py +++ b/app.py @@ -8,6 +8,7 @@ P2-ETF-CNN-LSTM-ALTERNATIVE-APPROACHES - Ann. Return compared vs SPY in metrics row - Max Daily DD shows date it occurred - Conviction panel: compact ETF probability list +- [NEW] Multi-Year Sweep tab: runs 8 start years, vote tally + comparison table """ import os @@ -32,6 +33,7 @@ from ui.components import ( show_metrics_row, show_comparison_table, show_audit_trail, show_all_signals_panel, ) +from ui.multiyear import run_multiyear_sweep, show_multiyear_results st.set_page_config(page_title="P2-ETF-CNN-LSTM", page_icon="π§ ", layout="wide") @@ -43,6 +45,8 @@ for key, default in [ ("test_dates", None), ("test_slice", None), ("optimal_lookback", None), ("df_for_chart", None), ("tbill_rate", None), ("target_etfs", None), ("from_cache", False), + # Multi-year sweep state + ("multiyear_ready", False), ("multiyear_results", None), ]: if key not in st.session_state: st.session_state[key] = default @@ -68,7 +72,6 @@ with st.sidebar: # ββ Title βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.title("π§ P2-ETF-CNN-LSTM") st.caption("Approach 1: Wavelet Β· Approach 2: Regime-Conditioned Β· Approach 3: Multi-Scale Parallel") -st.caption("Winner selected by highest raw annualised return on out-of-sample test set.") if not HF_TOKEN: st.error("β HF_TOKEN secret not found.") @@ -83,6 +86,7 @@ if df_raw.empty: freshness = check_data_freshness(df_raw) show_freshness_status(freshness) +last_date_str = str(freshness.get("last_date_in_data", "unknown")) # ββ Dataset info sidebar ββββββββββββββββββββββββββββββββββββββββββββββββββββββ with st.sidebar: @@ -112,7 +116,7 @@ if run_button: st.stop() n_etfs = len(target_etfs) - n_classes = n_etfs # CASH is overlay only β model always picks from ETFs + n_classes = n_etfs st.info( f"π― **Targets:** {', '.join([t.replace('_Ret','') for t in target_etfs])} 
Β· " @@ -120,7 +124,6 @@ if run_button: f"**T-bill:** {tbill_rate*100:.2f}%" ) - # ββ Raw arrays ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ X_raw = df[input_features].values.astype(np.float32) y_raw = np.clip(df[target_etfs].values.astype(np.float32), -0.5, 0.5) @@ -133,8 +136,6 @@ if run_button: if mask.any(): y_raw[mask, j] = 0.0 - last_date_str = str(freshness.get("last_date_in_data", "unknown")) - # ββ Auto-select lookback ββββββββββββββββββββββββββββββββββββββββββββββββββ lb_key = make_cache_key(last_date_str, start_yr, fee_bps, int(epochs), split_option, False, 0) @@ -186,12 +187,11 @@ if run_button: results, trained_info = {}, {} progress = st.progress(0, text="Training Approach 1...") - for approach, train_fn, predict_fn, train_kwargs in [ + for approach, train_fn, predict_fn in [ ("Approach 1", lambda: train_approach1(X_train_s, y_train_l, X_val_s, y_val_l, n_classes=n_classes, epochs=int(epochs)), - lambda m: predict_approach1(m[0], X_test_s), - None), + lambda m: predict_approach1(m[0], X_test_s)), ("Approach 2", lambda: train_approach2(X_train_s, y_train_l, X_val_s, y_val_l, X_flat_all=X_raw, feature_names=input_features, @@ -199,13 +199,11 @@ if run_button: val_size=val_size, n_classes=n_classes, epochs=int(epochs)), lambda m: predict_approach2(m[0], X_test_s, X_raw, m[3], m[2], - lookback, train_size, val_size), - None), + lookback, train_size, val_size)), ("Approach 3", lambda: train_approach3(X_train_s, y_train_l, X_val_s, y_val_l, n_classes=n_classes, epochs=int(epochs)), - lambda m: predict_approach3(m[0], X_test_s), - None), + lambda m: predict_approach3(m[0], X_test_s)), ]: try: model_out = train_fn() @@ -229,7 +227,6 @@ if run_button: "test_dates": list(test_dates), "test_slice": test_slice, }) - # ββ Persist to session state ββββββββββββββββββββββββββββββββββββββββββββββ st.session_state.update({ "results": results, "trained_info": trained_info, "test_dates": test_dates, "test_slice": test_slice, @@ -238,68 +235,116 @@ if 
run_button: "output_ready": True, }) -# ββ Render (persists across reruns via session_state) βββββββββββββββββββββββββ -if not st.session_state.output_ready: - st.info("π Configure parameters and click **π Run All 3 Approaches**.") - st.stop() +# ββ TABS ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +tab_single, tab_sweep = st.tabs(["π Single-Year Results", "π Multi-Year Consensus Sweep"]) -results = st.session_state.results -trained_info = st.session_state.trained_info -test_dates = st.session_state.test_dates -test_slice = st.session_state.test_slice -optimal_lookback = st.session_state.optimal_lookback -df = st.session_state.df_for_chart -tbill_rate = st.session_state.tbill_rate -target_etfs = st.session_state.target_etfs +# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +# TAB 1 β existing single-year output (unchanged) +# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +with tab_single: + if not st.session_state.output_ready: + st.info("π Configure parameters and click **π Run All 3 Approaches**.") + st.stop() -winner_name = select_winner(results) -winner_res = results.get(winner_name) + results = st.session_state.results + trained_info = st.session_state.trained_info + test_dates = st.session_state.test_dates + test_slice = st.session_state.test_slice + optimal_lookback = st.session_state.optimal_lookback + df = st.session_state.df_for_chart + tbill_rate = st.session_state.tbill_rate + target_etfs = st.session_state.target_etfs -if winner_res is None: - st.error("β All approaches failed.") - st.stop() + winner_name = select_winner(results) + winner_res = results.get(winner_name) -if st.session_state.from_cache: - st.success("β‘ Showing cached results.") + if winner_res is None: + st.error("β All approaches failed.") + st.stop() -next_date = get_next_signal_date() -st.divider() + st.caption("Winner selected by highest raw annualised return on out-of-sample test set.") 
-show_signal_banner(winner_res["next_signal"], next_date, winner_name) + next_date = get_next_signal_date() + st.divider() -winner_proba = trained_info[winner_name]["proba"] -conviction = compute_conviction(winner_proba[-1], target_etfs, include_cash=False) -show_conviction_panel(conviction) + show_signal_banner(winner_res["next_signal"], next_date, winner_name) -st.divider() + winner_proba = trained_info[winner_name]["proba"] + conviction = compute_conviction(winner_proba[-1], target_etfs, include_cash=False) + show_conviction_panel(conviction) -all_signals = { - name: {"signal": res["next_signal"], - "proba": trained_info[name]["proba"][-1], - "is_winner": name == winner_name} - for name, res in results.items() if res is not None -} -show_all_signals_panel(all_signals, target_etfs, False, next_date, optimal_lookback) + st.divider() -st.divider() -st.subheader(f"π {winner_name} β Performance Metrics") + all_signals = { + name: {"signal": res["next_signal"], + "proba": trained_info[name]["proba"][-1], + "is_winner": name == winner_name} + for name, res in results.items() if res is not None + } + show_all_signals_panel(all_signals, target_etfs, False, next_date, optimal_lookback) -# Compute SPY annualised return directly from raw returns for metrics comparison -spy_ann = None -if "SPY_Ret" in df.columns: - spy_raw = df["SPY_Ret"].iloc[test_slice].values.copy().astype(float) - spy_raw = spy_raw[~np.isnan(spy_raw)] - spy_raw = np.clip(spy_raw, -0.5, 0.5) - if len(spy_raw) > 5: - spy_cum = np.prod(1 + spy_raw) - spy_ann = float(spy_cum ** (252 / len(spy_raw)) - 1) + st.divider() + st.subheader(f"π {winner_name} β Performance Metrics") + + spy_ann = None + if "SPY_Ret" in df.columns: + spy_raw = df["SPY_Ret"].iloc[test_slice].values.copy().astype(float) + spy_raw = spy_raw[~np.isnan(spy_raw)] + spy_raw = np.clip(spy_raw, -0.5, 0.5) + if len(spy_raw) > 5: + spy_cum = np.prod(1 + spy_raw) + spy_ann = float(spy_cum ** (252 / len(spy_raw)) - 1) + + 
show_metrics_row(winner_res, tbill_rate, spy_ann_return=spy_ann) -show_metrics_row(winner_res, tbill_rate, spy_ann_return=spy_ann) + st.divider() + st.subheader("π Approach Comparison (Winner = Highest Raw Annualised Return)") + show_comparison_table(build_comparison_table(results, winner_name)) -st.divider() -st.subheader("π Approach Comparison (Winner = Highest Raw Annualised Return)") -show_comparison_table(build_comparison_table(results, winner_name)) + st.divider() + st.subheader(f"π Audit Trail β {winner_name} (Last 20 Trading Days)") + show_audit_trail(winner_res["audit_trail"]) + + +# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +# TAB 2 β Multi-Year Consensus Sweep +# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +with tab_sweep: + st.subheader("π Multi-Year Consensus Sweep") + st.markdown( + "Runs the winner model (Approach 2 proxy) across **8 start years** " + "and aggregates signals into a consensus vote. " + "Each year uses the same fee, epochs, and split settings as the sidebar. " + "Results are cached β only untrained years incur compute." 
+ ) -st.divider() -st.subheader(f"π Audit Trail β {winner_name} (Last 20 Trading Days)") -show_audit_trail(winner_res["audit_trail"]) + SWEEP_YEARS = [2010, 2012, 2014, 2016, 2018, 2019, 2021, 2023] + + col_l, col_r = st.columns([2, 1]) + with col_l: + st.caption(f"Sweep years: {', '.join(str(y) for y in SWEEP_YEARS)}") + with col_r: + sweep_button = st.button("π Run Consensus Sweep", type="primary", use_container_width=True) + + if sweep_button: + st.session_state.multiyear_ready = False + sweep_results = run_multiyear_sweep( + df_raw = df_raw, + sweep_years = SWEEP_YEARS, + fee_bps = fee_bps, + epochs = int(epochs), + split_option = split_option, + last_date_str = last_date_str, + train_pct = train_pct, + val_pct = val_pct, + ) + st.session_state.multiyear_results = sweep_results + st.session_state.multiyear_ready = True + + if st.session_state.multiyear_ready and st.session_state.multiyear_results: + show_multiyear_results( + st.session_state.multiyear_results, + sweep_years = SWEEP_YEARS, + ) + elif not st.session_state.multiyear_ready: + st.info("Click **π Run Consensus Sweep** to analyse all start years at once.") diff --git a/hf_space/data/loader.py b/hf_space/data/loader.py index 0f9fef47d7abc5d77c115965ba7fccaa081841a8..144589aa5bdeeb4844546502ccc88faf3dcb764f 100644 --- a/hf_space/data/loader.py +++ b/hf_space/data/loader.py @@ -4,14 +4,12 @@ Loads master_data.parquet from HF Dataset. Engineers rich feature set from raw price/macro columns. No external pings β all data from HF Dataset only. 
""" - import pandas as pd import numpy as np import streamlit as st from huggingface_hub import hf_hub_download from datetime import datetime, timedelta import pytz - try: import pandas_market_calendars as mcal NYSE_CAL_AVAILABLE = True @@ -20,14 +18,12 @@ except ImportError: DATASET_REPO = "P2SAMAPA/fi-etf-macro-signal-master-data" PARQUET_FILE = "master_data.parquet" -TARGET_ETF_COLS = ["TLT", "TBT", "VNQ", "SLV", "GLD"] +TARGET_ETF_COLS = ["TLT", "VNQ", "SLV", "GLD", "LQD", "HYG", "VCIT"] BENCHMARK_COLS = ["SPY", "AGG"] TBILL_COL = "TBILL_3M" MACRO_COLS = ["VIX", "DXY", "T10Y2Y", "IG_SPREAD", "HY_SPREAD"] - # ββ NYSE calendar βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ - def get_last_nyse_trading_day(as_of=None): est = pytz.timezone("US/Eastern") if as_of is None: @@ -46,9 +42,7 @@ def get_last_nyse_trading_day(as_of=None): candidate -= timedelta(days=1) return candidate - # ββ Data loading ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ - @st.cache_data(ttl=3600, show_spinner=False) def load_dataset(hf_token: str) -> pd.DataFrame: try: @@ -64,15 +58,13 @@ def load_dataset(hf_token: str) -> pd.DataFrame: if col in df.columns: df = df.set_index(col) break - df.index = pd.to_datetime(df.index) + df.index = pd.to_datetime(df.index) return df.sort_index() except Exception as e: st.error(f"β Failed to load dataset: {e}") return pd.DataFrame() - # ββ Freshness check βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ - def check_data_freshness(df: pd.DataFrame) -> dict: if df.empty: return {"fresh": False, "message": "Dataset is empty."} @@ -80,16 +72,14 @@ def check_data_freshness(df: pd.DataFrame) -> dict: expect = get_last_nyse_trading_day() fresh = last >= expect msg = ( - f"β Dataset up to date through **{last}**." if fresh else - f"β οΈ **{expect}** data not yet updated. Latest: **{last}**. " + f"β Dataset up to date through {last}." if fresh else + f"β οΈ {expect} data not yet updated. Latest: {last}. 
" f"Dataset updates daily after market close." ) return {"fresh": fresh, "last_date_in_data": last, "expected_date": expect, "message": msg} - # ββ Price β returns βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ - def _to_returns(series: pd.Series) -> pd.Series: """Convert price series to daily pct returns. If already returns, pass through.""" clean = series.dropna() @@ -99,13 +89,10 @@ def _to_returns(series: pd.Series) -> pd.Series: return series.pct_change() return series # already returns - # ββ Feature engineering βββββββββββββββββββββββββββββββββββββββββββββββββββββββ - def _engineer_features(df: pd.DataFrame, ret_cols: list) -> pd.DataFrame: """ Build a rich feature set from raw macro + ETF return columns. - Features added per ETF return: - 1d, 5d, 21d lagged returns - 5d, 21d rolling volatility @@ -120,7 +107,7 @@ def _engineer_features(df: pd.DataFrame, ret_cols: list) -> pd.DataFrame: - TBILL_3M as a feature (rate level) - VIX regime flag (VIX > 25) - Yield curve slope (already T10Y2Y) - - Cross-asset momentum: spread between TLT_ret and TBT_ret + - Cross-asset momentum: spread between TLT_ret and AGG_ret """ feat = pd.DataFrame(index=df.index) @@ -154,9 +141,9 @@ def _engineer_features(df: pd.DataFrame, ret_cols: list) -> pd.DataFrame: feat["TBILL_chg5"] = tbill.diff(5) # ββ Derived cross-asset signals βββββββββββββββββββββββββββββββββββββββββββ - if "TLT_Ret" in df.columns and "TBT_Ret" in df.columns: - feat["TLT_TBT_spread_mom5"] = ( - df["TLT_Ret"].rolling(5).sum() - df["TBT_Ret"].rolling(5).sum() + if "TLT_Ret" in df.columns and "AGG_Ret" in df.columns: + feat["TLT_AGG_spread_mom5"] = ( + df["TLT_Ret"].rolling(5).sum() - df["AGG_Ret"].rolling(5).sum() ) if "VIX" in df.columns: @@ -171,13 +158,10 @@ def _engineer_features(df: pd.DataFrame, ret_cols: list) -> pd.DataFrame: return feat - # ββ Main extraction function ββββββββββββββββββββββββββββββββββββββββββββββββββ - def get_features_and_targets(df: pd.DataFrame): """ Build return columns 
for target ETFs and engineer a rich feature set. - Returns: input_features : list[str] target_etfs : list[str] e.g. ["TLT_Ret", ...] @@ -240,9 +224,7 @@ def get_features_and_targets(df: pd.DataFrame): return input_features, target_etfs, tbill_rate, df, col_info - # ββ Dataset summary βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ - def dataset_summary(df: pd.DataFrame) -> dict: if df.empty: return {} diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py index b944ad0559ead182d0413386f9c49985e5059d66..bc6e5582806fd23c779112cf110ba46101a0e27d 100644 --- a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py @@ -32,7 +32,6 @@ from ui.components import ( show_metrics_row, show_comparison_table, show_audit_trail, show_all_signals_panel, ) -from ui.charts import equity_curve_chart st.set_page_config(page_title="P2-ETF-CNN-LSTM", page_icon="π§ ", layout="wide") @@ -169,7 +168,7 @@ if run_button: st.success("β‘ Results loaded from cache β no retraining needed.") else: X_seq, y_seq = build_sequences(X_raw, y_raw, lookback) - y_labels = returns_to_labels(y_seq, include_cash=False) + y_labels = returns_to_labels(y_seq) (X_train, y_train_r, X_val, y_val_r, X_test, y_test_r) = train_val_test_split(X_seq, y_seq, train_pct, val_pct) @@ -285,18 +284,22 @@ show_all_signals_panel(all_signals, target_etfs, False, next_date, optimal_lookb st.divider() st.subheader(f"π {winner_name} β Performance Metrics") -# Build equity curve first to get spy_ann for metrics comparison -fig, spy_ann = equity_curve_chart(results, winner_name, test_dates, df, test_slice, tbill_rate) +# Compute SPY annualised return directly from raw returns for metrics comparison +spy_ann = None +if "SPY_Ret" in df.columns: + spy_raw = df["SPY_Ret"].iloc[test_slice].values.copy().astype(float) + spy_raw = spy_raw[~np.isnan(spy_raw)] + spy_raw = 
np.clip(spy_raw, -0.5, 0.5) + if len(spy_raw) > 5: + spy_cum = np.prod(1 + spy_raw) + spy_ann = float(spy_cum ** (252 / len(spy_raw)) - 1) + show_metrics_row(winner_res, tbill_rate, spy_ann_return=spy_ann) st.divider() st.subheader("π Approach Comparison (Winner = Highest Raw Annualised Return)") show_comparison_table(build_comparison_table(results, winner_name)) -st.divider() -st.subheader(f"π {winner_name} vs SPY & AGG β Out-of-Sample") -st.plotly_chart(fig, use_container_width=True) - st.divider() st.subheader(f"π Audit Trail β {winner_name} (Last 20 Trading Days)") show_audit_trail(winner_res["audit_trail"]) diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py index 48a79bdac6d956eecd66f6930db41223bdf743ef..b944ad0559ead182d0413386f9c49985e5059d66 100644 --- a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py @@ -148,8 +148,7 @@ if run_button: with st.spinner("π Auto-selecting optimal lookback (30 / 45 / 60d)..."): optimal_lookback = find_best_lookback( X_raw, y_raw, - lambda y: returns_to_labels(y, include_cash=False), - train_pct, val_pct, n_classes, False, + train_pct, val_pct, n_classes, candidates=[30, 45, 60], ) save_cache(f"lb_{lb_key}", {"optimal_lookback": optimal_lookback}) diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py index 6b05505276c6be72f7860e645761efee6a3b9bf5..48a79bdac6d956eecd66f6930db41223bdf743ef 100644 --- 
a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py @@ -1,7 +1,13 @@ """ app.py P2-ETF-CNN-LSTM-ALTERNATIVE-APPROACHES -Streamlit orchestrator β UI wiring only, no business logic here. +- Session state persistence (results don't vanish on rerun) +- Model caching keyed by data date + config params +- Auto-lookback (30/45/60d) +- CASH is a drawdown risk overlay (not a model class) +- Ann. Return compared vs SPY in metrics row +- Max Daily DD shows date it occurred +- Conviction panel: compact ETF probability list """ import os @@ -32,11 +38,20 @@ st.set_page_config(page_title="P2-ETF-CNN-LSTM", page_icon="π§ ", layout="wide" HF_TOKEN = os.getenv("HF_TOKEN", "") +# ββ Session state init ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +for key, default in [ + ("output_ready", False), ("results", None), ("trained_info", None), + ("test_dates", None), ("test_slice", None), ("optimal_lookback", None), + ("df_for_chart", None), ("tbill_rate", None), ("target_etfs", None), + ("from_cache", False), +]: + if key not in st.session_state: + st.session_state[key] = default + # ββ Sidebar βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ with st.sidebar: st.header("βοΈ Configuration") - now_est = get_est_time() - st.write(f"π **EST:** {now_est.strftime('%H:%M:%S')}") + st.write(f"π **EST:** {get_est_time().strftime('%H:%M:%S')}") st.divider() start_yr = st.slider("π Start Year", 2010, 2024, 2016) @@ -47,9 +62,7 @@ with st.sidebar: split_option = st.selectbox("π Train/Val/Test Split", ["70/15/15", "80/10/10"], index=0) train_pct, val_pct = {"70/15/15": (0.70, 0.15), "80/10/10": (0.80, 0.10)}[split_option] - include_cash = st.checkbox("π΅ Include CASH class", value=True, - help="Model can select CASH 
(earns T-bill rate) instead of any ETF") - + st.caption("π‘ CASH triggered automatically on 2-day drawdown β€ β15%") st.divider() run_button = st.button("π Run All 3 Approaches", type="primary", use_container_width=True) @@ -85,226 +98,206 @@ with st.sidebar: st.write(f"**Macro:** {', '.join(summary['macro_found'])}") st.write(f"**T-bill col:** {'β ' if summary['tbill_found'] else 'β'}") -if not run_button: - st.info("π Configure parameters and click **π Run All 3 Approaches**.") - st.stop() - -# ββ Filter by start year ββββββββββββββββββββββββββββββββββββββββββββββββββββββ -df = df_raw[df_raw.index.year >= start_yr].copy() -st.write(f"π **Data:** {df.index[0].strftime('%Y-%m-%d')} β {df.index[-1].strftime('%Y-%m-%d')} " - f"({df.index[-1].year - df.index[0].year + 1} years)") +# ββ Run button ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +if run_button: + st.session_state.output_ready = False -# ββ Features & targets ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ -try: - input_features, target_etfs, tbill_rate, df, _ = get_features_and_targets(df) -except ValueError as e: - st.error(str(e)) - st.stop() + df = df_raw[df_raw.index.year >= start_yr].copy() + st.write(f"π **Data:** {df.index[0].strftime('%Y-%m-%d')} β {df.index[-1].strftime('%Y-%m-%d')} " + f"({df.index[-1].year - df.index[0].year + 1} years)") -n_etfs = len(target_etfs) -n_classes = n_etfs + (1 if include_cash else 0) - -st.info( - f"π― **Targets:** {', '.join([t.replace('_Ret','') for t in target_etfs])} Β· " - f"**Features:** {len(input_features)} signals Β· " - f"**T-bill:** {tbill_rate*100:.2f}%" -) - -# ββ Prepare raw arrays ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ -X_raw = df[input_features].values.astype(np.float32) -y_raw = df[target_etfs].values.astype(np.float32) - -for j in range(X_raw.shape[1]): - mask = np.isnan(X_raw[:, j]) - if mask.any(): - X_raw[mask, j] = np.nanmean(X_raw[:, j]) -for j in range(y_raw.shape[1]): - mask = np.isnan(y_raw[:, 
j]) - if mask.any(): - y_raw[mask, j] = np.nanmean(y_raw[:, j]) - -# ββ Auto-select optimal lookback ββββββββββββββββββββββββββββββββββββββββββββββ -last_date_str = str(freshness.get("last_date_in_data", "unknown")) - -# Check cache for lookback selection too -lb_cache_key = make_cache_key( - last_date_str, start_yr, fee_bps, int(epochs), split_option, include_cash, 0 -) -lb_cached = load_cache(f"lb_{lb_cache_key}") - -if lb_cached is not None: - optimal_lookback = lb_cached["optimal_lookback"] - st.success(f"β‘ Loaded from cache Β· Optimal lookback: **{optimal_lookback}d**") -else: - with st.spinner("π Finding optimal lookback (30 / 45 / 60d)..."): - def _y_labels_fn(y_seq): - return returns_to_labels(y_seq, include_cash=include_cash) - optimal_lookback = find_best_lookback( - X_raw, y_raw, _y_labels_fn, - train_pct, val_pct, n_classes, include_cash, - candidates=[30, 45, 60], - ) - save_cache(f"lb_{lb_cache_key}", {"optimal_lookback": optimal_lookback}) - st.success(f"π Optimal lookback: **{optimal_lookback}d** (auto-selected from 30/45/60)") - -lookback = optimal_lookback - -# ββ Check full model cache ββββββββββββββββββββββββββββββββββββββββββββββββββββ -cache_key = make_cache_key(last_date_str, start_yr, fee_bps, int(epochs), - split_option, include_cash, lookback) -cached_data = load_cache(cache_key) -from_cache = cached_data is not None - -if from_cache: - results = cached_data["results"] - trained_info = cached_data["trained_info"] - test_dates = pd.DatetimeIndex(cached_data["test_dates"]) - test_slice = cached_data["test_slice"] - st.success("β‘ Results loaded from cache β no retraining needed.") -else: - # ββ Build sequences βββββββββββββββββββββββββββββββββββββββββββββββββββββββ - X_seq, y_seq = build_sequences(X_raw, y_raw, lookback) - y_labels = returns_to_labels(y_seq, include_cash=include_cash) - - (X_train, y_train_r, X_val, y_val_r, - X_test, y_test_r) = train_val_test_split(X_seq, y_seq, train_pct, val_pct) - (_, y_train_l, _, y_val_l, - _, _) = 
train_val_test_split(X_seq, y_labels, train_pct, val_pct) - - X_train_s, X_val_s, X_test_s, _ = scale_features(X_train, X_val, X_test) - - train_size = len(X_train) - val_size = len(X_val) - test_start = lookback + train_size + val_size - test_dates = df.index[test_start: test_start + len(X_test)] - test_slice = slice(test_start, test_start + len(X_test)) - - results = {} - trained_info = {} - progress = st.progress(0, text="Training Approach 1...") - - # ββ Approach 1 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ - try: - model1, _, _ = train_approach1( - X_train_s, y_train_l, X_val_s, y_val_l, - n_classes=n_classes, epochs=int(epochs), - ) - preds1, proba1 = predict_approach1(model1, X_test_s) - results["Approach 1"] = execute_strategy( - preds1, proba1, y_test_r, test_dates, - target_etfs, fee_bps, tbill_rate, include_cash, - ) - trained_info["Approach 1"] = {"proba": proba1} - except Exception as e: - st.warning(f"β οΈ Approach 1 failed: {e}") - results["Approach 1"] = None - - progress.progress(33, text="Training Approach 2...") - - # ββ Approach 2 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ try: - model2, _, hmm2, regime_cols2 = train_approach2( - X_train_s, y_train_l, X_val_s, y_val_l, - X_flat_all=X_raw, feature_names=input_features, - lookback=lookback, train_size=train_size, val_size=val_size, - n_classes=n_classes, epochs=int(epochs), - ) - preds2, proba2 = predict_approach2( - model2, X_test_s, X_raw, regime_cols2, hmm2, - lookback, train_size, val_size, - ) - results["Approach 2"] = execute_strategy( - preds2, proba2, y_test_r, test_dates, - target_etfs, fee_bps, tbill_rate, include_cash, - ) - trained_info["Approach 2"] = {"proba": proba2} - except Exception as e: - st.warning(f"β οΈ Approach 2 failed: {e}") - results["Approach 2"] = None - - progress.progress(66, text="Training Approach 3...") - - # ββ Approach 3 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ - try: - model3, _ = train_approach3( - 
X_train_s, y_train_l, X_val_s, y_val_l, - n_classes=n_classes, epochs=int(epochs), - ) - preds3, proba3 = predict_approach3(model3, X_test_s) - results["Approach 3"] = execute_strategy( - preds3, proba3, y_test_r, test_dates, - target_etfs, fee_bps, tbill_rate, include_cash, - ) - trained_info["Approach 3"] = {"proba": proba3} - except Exception as e: - st.warning(f"β οΈ Approach 3 failed: {e}") - results["Approach 3"] = None - - progress.progress(100, text="Done!") - progress.empty() - - # ββ Save to cache βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ - save_cache(cache_key, { - "results": results, - "trained_info": trained_info, - "test_dates": list(test_dates), - "test_slice": test_slice, + input_features, target_etfs, tbill_rate, df, _ = get_features_and_targets(df) + except ValueError as e: + st.error(str(e)) + st.stop() + + n_etfs = len(target_etfs) + n_classes = n_etfs # CASH is overlay only β model always picks from ETFs + + st.info( + f"π― **Targets:** {', '.join([t.replace('_Ret','') for t in target_etfs])} Β· " + f"**Features:** {len(input_features)} signals Β· " + f"**T-bill:** {tbill_rate*100:.2f}%" + ) + + # ββ Raw arrays ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + X_raw = df[input_features].values.astype(np.float32) + y_raw = np.clip(df[target_etfs].values.astype(np.float32), -0.5, 0.5) + + for j in range(X_raw.shape[1]): + mask = np.isnan(X_raw[:, j]) + if mask.any(): + X_raw[mask, j] = np.nanmean(X_raw[:, j]) + for j in range(y_raw.shape[1]): + mask = np.isnan(y_raw[:, j]) + if mask.any(): + y_raw[mask, j] = 0.0 + + last_date_str = str(freshness.get("last_date_in_data", "unknown")) + + # ββ Auto-select lookback ββββββββββββββββββββββββββββββββββββββββββββββββββ + lb_key = make_cache_key(last_date_str, start_yr, fee_bps, int(epochs), + split_option, False, 0) + lb_cached = load_cache(f"lb_{lb_key}") + + if lb_cached is not None: + optimal_lookback = lb_cached["optimal_lookback"] + st.success(f"β‘ Cache hit Β· Optimal 
lookback: **{optimal_lookback}d**") + else: + with st.spinner("π Auto-selecting optimal lookback (30 / 45 / 60d)..."): + optimal_lookback = find_best_lookback( + X_raw, y_raw, + lambda y: returns_to_labels(y, include_cash=False), + train_pct, val_pct, n_classes, False, + candidates=[30, 45, 60], + ) + save_cache(f"lb_{lb_key}", {"optimal_lookback": optimal_lookback}) + st.success(f"π Optimal lookback: **{optimal_lookback}d** (auto-selected from 30/45/60)") + + lookback = optimal_lookback + + # ββ Check model cache βββββββββββββββββββββββββββββββββββββββββββββββββββββ + cache_key = make_cache_key(last_date_str, start_yr, fee_bps, int(epochs), + split_option, False, lookback) + cached_data = load_cache(cache_key) + + if cached_data is not None: + results = cached_data["results"] + trained_info = cached_data["trained_info"] + test_dates = pd.DatetimeIndex(cached_data["test_dates"]) + test_slice = cached_data["test_slice"] + st.success("β‘ Results loaded from cache β no retraining needed.") + else: + X_seq, y_seq = build_sequences(X_raw, y_raw, lookback) + y_labels = returns_to_labels(y_seq, include_cash=False) + + (X_train, y_train_r, X_val, y_val_r, + X_test, y_test_r) = train_val_test_split(X_seq, y_seq, train_pct, val_pct) + (_, y_train_l, _, y_val_l, + _, _) = train_val_test_split(X_seq, y_labels, train_pct, val_pct) + + X_train_s, X_val_s, X_test_s, _ = scale_features(X_train, X_val, X_test) + + train_size = len(X_train) + val_size = len(X_val) + test_start = lookback + train_size + val_size + test_dates = df.index[test_start: test_start + len(X_test)] + test_slice = slice(test_start, test_start + len(X_test)) + + results, trained_info = {}, {} + progress = st.progress(0, text="Training Approach 1...") + + for approach, train_fn, predict_fn, train_kwargs in [ + ("Approach 1", + lambda: train_approach1(X_train_s, y_train_l, X_val_s, y_val_l, + n_classes=n_classes, epochs=int(epochs)), + lambda m: predict_approach1(m[0], X_test_s), + None), + ("Approach 2", + 
lambda: train_approach2(X_train_s, y_train_l, X_val_s, y_val_l, + X_flat_all=X_raw, feature_names=input_features, + lookback=lookback, train_size=train_size, + val_size=val_size, n_classes=n_classes, + epochs=int(epochs)), + lambda m: predict_approach2(m[0], X_test_s, X_raw, m[3], m[2], + lookback, train_size, val_size), + None), + ("Approach 3", + lambda: train_approach3(X_train_s, y_train_l, X_val_s, y_val_l, + n_classes=n_classes, epochs=int(epochs)), + lambda m: predict_approach3(m[0], X_test_s), + None), + ]: + try: + model_out = train_fn() + preds, proba = predict_fn(model_out) + results[approach] = execute_strategy( + preds, proba, y_test_r, test_dates, + target_etfs, fee_bps, tbill_rate, + ) + trained_info[approach] = {"proba": proba} + except Exception as e: + st.warning(f"β οΈ {approach} failed: {e}") + results[approach] = None + + pct = {"Approach 1": 33, "Approach 2": 66, "Approach 3": 100}[approach] + progress.progress(pct, text=f"{approach} done...") + + progress.empty() + + save_cache(cache_key, { + "results": results, "trained_info": trained_info, + "test_dates": list(test_dates), "test_slice": test_slice, + }) + + # ββ Persist to session state ββββββββββββββββββββββββββββββββββββββββββββββ + st.session_state.update({ + "results": results, "trained_info": trained_info, + "test_dates": test_dates, "test_slice": test_slice, + "optimal_lookback": optimal_lookback, "df_for_chart": df, + "tbill_rate": tbill_rate, "target_etfs": target_etfs, + "output_ready": True, }) -# ββ Select winner βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +# ββ Render (persists across reruns via session_state) βββββββββββββββββββββββββ +if not st.session_state.output_ready: + st.info("π Configure parameters and click **π Run All 3 Approaches**.") + st.stop() + +results = st.session_state.results +trained_info = st.session_state.trained_info +test_dates = st.session_state.test_dates +test_slice = st.session_state.test_slice +optimal_lookback = 
st.session_state.optimal_lookback +df = st.session_state.df_for_chart +tbill_rate = st.session_state.tbill_rate +target_etfs = st.session_state.target_etfs + winner_name = select_winner(results) winner_res = results.get(winner_name) if winner_res is None: - st.error("β All approaches failed. Please check data and configuration.") + st.error("β All approaches failed.") st.stop() +if st.session_state.from_cache: + st.success("β‘ Showing cached results.") + next_date = get_next_signal_date() st.divider() -# ββ Winner signal banner ββββββββββββββββββββββββββββββββββββββββββββββββββββββ show_signal_banner(winner_res["next_signal"], next_date, winner_name) -# ββ Conviction panel ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ winner_proba = trained_info[winner_name]["proba"] -conviction = compute_conviction(winner_proba[-1], target_etfs, include_cash) +conviction = compute_conviction(winner_proba[-1], target_etfs, include_cash=False) show_conviction_panel(conviction) st.divider() -# ββ All models next day signals βββββββββββββββββββββββββββββββββββββββββββββββ all_signals = { - name: { - "signal": res["next_signal"], - "proba": trained_info[name]["proba"][-1], - "is_winner": name == winner_name, - } + name: {"signal": res["next_signal"], + "proba": trained_info[name]["proba"][-1], + "is_winner": name == winner_name} for name, res in results.items() if res is not None } -show_all_signals_panel(all_signals, target_etfs, include_cash, next_date, optimal_lookback) +show_all_signals_panel(all_signals, target_etfs, False, next_date, optimal_lookback) st.divider() - -# ββ Winner performance metrics ββββββββββββββββββββββββββββββββββββββββββββββββ st.subheader(f"π {winner_name} β Performance Metrics") -show_metrics_row(winner_res, tbill_rate) -st.divider() +# Build equity curve first to get spy_ann for metrics comparison +fig, spy_ann = equity_curve_chart(results, winner_name, test_dates, df, test_slice, tbill_rate) +show_metrics_row(winner_res, tbill_rate, 
spy_ann_return=spy_ann) -# ββ Comparison table ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +st.divider() st.subheader("π Approach Comparison (Winner = Highest Raw Annualised Return)") -comparison_df = build_comparison_table(results, winner_name) -show_comparison_table(comparison_df) +show_comparison_table(build_comparison_table(results, winner_name)) st.divider() - -# ββ Equity curve ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.subheader(f"π {winner_name} vs SPY & AGG β Out-of-Sample") -fig = equity_curve_chart(results, winner_name, test_dates, df, test_slice, tbill_rate) st.plotly_chart(fig, use_container_width=True) st.divider() - -# ββ Audit trail βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.subheader(f"π Audit Trail β {winner_name} (Last 20 Trading Days)") show_audit_trail(winner_res["audit_trail"]) diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py index 41b113eda1b094c1a0f6b40036d922934b8f2def..6b05505276c6be72f7860e645761efee6a3b9bf5 100644 --- a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py @@ -13,7 +13,9 @@ from data.loader import (load_dataset, check_data_freshness, get_features_and_targets, dataset_summary) from utils.calendar 
import get_est_time, get_next_signal_date from models.base import (build_sequences, train_val_test_split, - scale_features, returns_to_labels) + scale_features, returns_to_labels, + find_best_lookback, make_cache_key, + save_cache, load_cache) from models.approach1_wavelet import train_approach1, predict_approach1 from models.approach2_regime import train_approach2, predict_approach2 from models.approach3_multiscale import train_approach3, predict_approach3 @@ -39,8 +41,7 @@ with st.sidebar: start_yr = st.slider("π Start Year", 2010, 2024, 2016) fee_bps = st.slider("π° Fee (bps)", 0, 50, 10) - lookback = st.slider("π Lookback (days)", 20, 60, 30, step=5) - epochs = st.number_input("π Max Epochs", 20, 300, 100, step=10) + epochs = st.number_input("π Max Epochs", 20, 150, 80, step=10) st.divider() split_option = st.selectbox("π Train/Val/Test Split", ["70/15/15", "80/10/10"], index=0) @@ -109,7 +110,7 @@ st.info( f"**T-bill:** {tbill_rate*100:.2f}%" ) -# ββ Build sequences βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +# ββ Prepare raw arrays ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ X_raw = df[input_features].values.astype(np.float32) y_raw = df[target_etfs].values.astype(np.float32) @@ -117,39 +118,74 @@ for j in range(X_raw.shape[1]): mask = np.isnan(X_raw[:, j]) if mask.any(): X_raw[mask, j] = np.nanmean(X_raw[:, j]) - for j in range(y_raw.shape[1]): mask = np.isnan(y_raw[:, j]) if mask.any(): y_raw[mask, j] = np.nanmean(y_raw[:, j]) -X_seq, y_seq = build_sequences(X_raw, y_raw, lookback) -y_labels = returns_to_labels(y_seq, include_cash=include_cash) - -(X_train, y_train_r, X_val, y_val_r, - X_test, y_test_r) = train_val_test_split(X_seq, y_seq, train_pct, val_pct) -(_, y_train_l, _, y_val_l, - _, _) = train_val_test_split(X_seq, y_labels, train_pct, val_pct) - -X_train_s, X_val_s, X_test_s, _ = scale_features(X_train, X_val, X_test) - -train_size = len(X_train) -val_size = len(X_val) -test_start = lookback + train_size + val_size 
-test_dates = df.index[test_start: test_start + len(X_test)] -test_slice = slice(test_start, test_start + len(X_test)) - -st.success(f"β Sequences β Train: {train_size:,} Β· Val: {val_size:,} Β· Test: {len(X_test):,}") +# ββ Auto-select optimal lookback ββββββββββββββββββββββββββββββββββββββββββββββ +last_date_str = str(freshness.get("last_date_in_data", "unknown")) -# ββ Train all three approaches ββββββββββββββββββββββββββββββββββββββββββββββββ -results = {} -trained_info = {} -progress = st.progress(0, text="Starting training...") - -# Approach 1 -with st.spinner("π Training Approach 1 β Wavelet CNN-LSTM..."): +# Check cache for lookback selection too +lb_cache_key = make_cache_key( + last_date_str, start_yr, fee_bps, int(epochs), split_option, include_cash, 0 +) +lb_cached = load_cache(f"lb_{lb_cache_key}") + +if lb_cached is not None: + optimal_lookback = lb_cached["optimal_lookback"] + st.success(f"β‘ Loaded from cache Β· Optimal lookback: **{optimal_lookback}d**") +else: + with st.spinner("π Finding optimal lookback (30 / 45 / 60d)..."): + def _y_labels_fn(y_seq): + return returns_to_labels(y_seq, include_cash=include_cash) + optimal_lookback = find_best_lookback( + X_raw, y_raw, _y_labels_fn, + train_pct, val_pct, n_classes, include_cash, + candidates=[30, 45, 60], + ) + save_cache(f"lb_{lb_cache_key}", {"optimal_lookback": optimal_lookback}) + st.success(f"π Optimal lookback: **{optimal_lookback}d** (auto-selected from 30/45/60)") + +lookback = optimal_lookback + +# ββ Check full model cache ββββββββββββββββββββββββββββββββββββββββββββββββββββ +cache_key = make_cache_key(last_date_str, start_yr, fee_bps, int(epochs), + split_option, include_cash, lookback) +cached_data = load_cache(cache_key) +from_cache = cached_data is not None + +if from_cache: + results = cached_data["results"] + trained_info = cached_data["trained_info"] + test_dates = pd.DatetimeIndex(cached_data["test_dates"]) + test_slice = cached_data["test_slice"] + st.success("β‘ Results loaded 
from cache β no retraining needed.") +else: + # ββ Build sequences βββββββββββββββββββββββββββββββββββββββββββββββββββββββ + X_seq, y_seq = build_sequences(X_raw, y_raw, lookback) + y_labels = returns_to_labels(y_seq, include_cash=include_cash) + + (X_train, y_train_r, X_val, y_val_r, + X_test, y_test_r) = train_val_test_split(X_seq, y_seq, train_pct, val_pct) + (_, y_train_l, _, y_val_l, + _, _) = train_val_test_split(X_seq, y_labels, train_pct, val_pct) + + X_train_s, X_val_s, X_test_s, _ = scale_features(X_train, X_val, X_test) + + train_size = len(X_train) + val_size = len(X_val) + test_start = lookback + train_size + val_size + test_dates = df.index[test_start: test_start + len(X_test)] + test_slice = slice(test_start, test_start + len(X_test)) + + results = {} + trained_info = {} + progress = st.progress(0, text="Training Approach 1...") + + # ββ Approach 1 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ try: - model1, hist1, _ = train_approach1( + model1, _, _ = train_approach1( X_train_s, y_train_l, X_val_s, y_val_l, n_classes=n_classes, epochs=int(epochs), ) @@ -159,17 +195,15 @@ with st.spinner("π Training Approach 1 β Wavelet CNN-LSTM..."): target_etfs, fee_bps, tbill_rate, include_cash, ) trained_info["Approach 1"] = {"proba": proba1} - st.success("β Approach 1 complete") except Exception as e: st.warning(f"β οΈ Approach 1 failed: {e}") results["Approach 1"] = None -progress.progress(33, text="Approach 1 done...") + progress.progress(33, text="Training Approach 2...") -# Approach 2 -with st.spinner("π Training Approach 2 β Regime-Conditioned CNN-LSTM..."): + # ββ Approach 2 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ try: - model2, hist2, hmm2, regime_cols2 = train_approach2( + model2, _, hmm2, regime_cols2 = train_approach2( X_train_s, y_train_l, X_val_s, y_val_l, X_flat_all=X_raw, feature_names=input_features, lookback=lookback, train_size=train_size, val_size=val_size, @@ -184,17 +218,15 @@ with st.spinner("π Training 
Approach 2 β Regime-Conditioned CNN-LSTM..."): target_etfs, fee_bps, tbill_rate, include_cash, ) trained_info["Approach 2"] = {"proba": proba2} - st.success("β Approach 2 complete") except Exception as e: st.warning(f"β οΈ Approach 2 failed: {e}") results["Approach 2"] = None -progress.progress(66, text="Approach 2 done...") + progress.progress(66, text="Training Approach 3...") -# Approach 3 -with st.spinner("π‘ Training Approach 3 β Multi-Scale CNN-LSTM..."): + # ββ Approach 3 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ try: - model3, hist3 = train_approach3( + model3, _ = train_approach3( X_train_s, y_train_l, X_val_s, y_val_l, n_classes=n_classes, epochs=int(epochs), ) @@ -204,13 +236,20 @@ with st.spinner("π‘ Training Approach 3 β Multi-Scale CNN-LSTM..."): target_etfs, fee_bps, tbill_rate, include_cash, ) trained_info["Approach 3"] = {"proba": proba3} - st.success("β Approach 3 complete") except Exception as e: st.warning(f"β οΈ Approach 3 failed: {e}") results["Approach 3"] = None -progress.progress(100, text="All approaches complete!") -progress.empty() + progress.progress(100, text="Done!") + progress.empty() + + # ββ Save to cache βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + save_cache(cache_key, { + "results": results, + "trained_info": trained_info, + "test_dates": list(test_dates), + "test_slice": test_slice, + }) # ββ Select winner βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ winner_name = select_winner(results) @@ -226,14 +265,14 @@ st.divider() # ββ Winner signal banner ββββββββββββββββββββββββββββββββββββββββββββββββββββββ show_signal_banner(winner_res["next_signal"], next_date, winner_name) -# ββ Conviction panel (winner only) ββββββββββββββββββββββββββββββββββββββββββββ +# ββ Conviction panel ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ winner_proba = trained_info[winner_name]["proba"] conviction = compute_conviction(winner_proba[-1], target_etfs, include_cash) 
show_conviction_panel(conviction) st.divider() -# ββ All models' next day signals ββββββββββββββββββββββββββββββββββββββββββββββ +# ββ All models next day signals βββββββββββββββββββββββββββββββββββββββββββββββ all_signals = { name: { "signal": res["next_signal"], @@ -242,7 +281,7 @@ all_signals = { } for name, res in results.items() if res is not None } -show_all_signals_panel(all_signals, target_etfs, include_cash, next_date) +show_all_signals_panel(all_signals, target_etfs, include_cash, next_date, optimal_lookback) st.divider() @@ -259,13 +298,13 @@ show_comparison_table(comparison_df) st.divider() -# ββ Equity curves βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ -st.subheader("π Out-of-Sample Equity Curves β All Approaches vs Benchmarks") +# ββ Equity curve ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +st.subheader(f"π {winner_name} vs SPY & AGG β Out-of-Sample") fig = equity_curve_chart(results, winner_name, test_dates, df, test_slice, tbill_rate) st.plotly_chart(fig, use_container_width=True) st.divider() -# ββ Audit trail (winner) ββββββββββββββββββββββββββββββββββββββββββββββββββββββ +# ββ Audit trail βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.subheader(f"π Audit Trail β {winner_name} (Last 20 Trading Days)") show_audit_trail(winner_res["audit_trail"]) diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py index d3e04d6d6eaed853e7143f18f88a5fb069c05d86..41b113eda1b094c1a0f6b40036d922934b8f2def 100644 --- 
a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py @@ -11,7 +11,7 @@ import numpy as np from data.loader import (load_dataset, check_data_freshness, get_features_and_targets, dataset_summary) -from utils.calendar import get_est_time, is_sync_window, get_next_signal_date +from utils.calendar import get_est_time, get_next_signal_date from models.base import (build_sequences, train_val_test_split, scale_features, returns_to_labels) from models.approach1_wavelet import train_approach1, predict_approach1 @@ -22,8 +22,9 @@ from signals.conviction import compute_conviction from ui.components import ( show_freshness_status, show_signal_banner, show_conviction_panel, show_metrics_row, show_comparison_table, show_audit_trail, + show_all_signals_panel, ) -from ui.charts import equity_curve_chart, comparison_bar_chart +from ui.charts import equity_curve_chart st.set_page_config(page_title="P2-ETF-CNN-LSTM", page_icon="π§ ", layout="wide") @@ -34,12 +35,8 @@ with st.sidebar: st.header("βοΈ Configuration") now_est = get_est_time() st.write(f"π **EST:** {now_est.strftime('%H:%M:%S')}") - if is_sync_window(): - st.success("β Sync Window Active") - else: - st.info("βΈοΈ Sync Window Inactive") - st.divider() + start_yr = st.slider("π Start Year", 2010, 2024, 2016) fee_bps = st.slider("π° Fee (bps)", 0, 50, 10) lookback = st.slider("π Lookback (days)", 20, 60, 30, step=5) @@ -87,9 +84,6 @@ with st.sidebar: st.write(f"**Macro:** {', '.join(summary['macro_found'])}") st.write(f"**T-bill col:** {'β ' if summary['tbill_found'] else 'β'}") - with 
st.expander("π All columns"): - st.write(summary["all_cols"]) - if not run_button: st.info("π Configure parameters and click **π Run All 3 Approaches**.") st.stop() @@ -101,7 +95,7 @@ st.write(f"π **Data:** {df.index[0].strftime('%Y-%m-%d')} β {df.index[-1].s # ββ Features & targets ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ try: - input_features, target_etfs, tbill_rate, df, col_info = get_features_and_targets(df) + input_features, target_etfs, tbill_rate, df, _ = get_features_and_targets(df) except ValueError as e: st.error(str(e)) st.stop() @@ -109,18 +103,6 @@ except ValueError as e: n_etfs = len(target_etfs) n_classes = n_etfs + (1 if include_cash else 0) -# ββ Show column detection diagnostics ββββββββββββββββββββββββββββββββββββββββ -with st.expander("π¬ Column detection diagnostics", expanded=False): - st.write("**How each ETF column was interpreted:**") - for col, info in col_info.items(): - st.write(f"- `{col}`: {info}") - st.write(f"**Input features ({len(input_features)}):** {input_features}") - st.write(f"**T-bill rate used:** {tbill_rate*100:.3f}%") - - # Show sample return values to verify correctness - st.write("**Sample target return values (last 3 rows):**") - st.dataframe(df[target_etfs].tail(3)) - st.info( f"π― **Targets:** {', '.join([t.replace('_Ret','') for t in target_etfs])} Β· " f"**Features:** {len(input_features)} signals Β· " @@ -131,19 +113,15 @@ st.info( X_raw = df[input_features].values.astype(np.float32) y_raw = df[target_etfs].values.astype(np.float32) -# Fill NaNs -col_means = np.nanmean(X_raw, axis=0) for j in range(X_raw.shape[1]): mask = np.isnan(X_raw[:, j]) if mask.any(): - X_raw[mask, j] = col_means[j] + X_raw[mask, j] = np.nanmean(X_raw[:, j]) -# Also fill NaNs in y_raw -y_means = np.nanmean(y_raw, axis=0) for j in range(y_raw.shape[1]): mask = np.isnan(y_raw[:, j]) if mask.any(): - y_raw[mask, j] = y_means[j] + y_raw[mask, j] = np.nanmean(y_raw[:, j]) X_seq, y_seq = build_sequences(X_raw, y_raw, lookback) 
y_labels = returns_to_labels(y_seq, include_cash=include_cash) @@ -151,7 +129,7 @@ y_labels = returns_to_labels(y_seq, include_cash=include_cash) (X_train, y_train_r, X_val, y_val_r, X_test, y_test_r) = train_val_test_split(X_seq, y_seq, train_pct, val_pct) (_, y_train_l, _, y_val_l, - _, y_test_l) = train_val_test_split(X_seq, y_labels, train_pct, val_pct) + _, _) = train_val_test_split(X_seq, y_labels, train_pct, val_pct) X_train_s, X_val_s, X_test_s, _ = scale_features(X_train, X_val, X_test) @@ -163,14 +141,6 @@ test_slice = slice(test_start, test_start + len(X_test)) st.success(f"β Sequences β Train: {train_size:,} Β· Val: {val_size:,} Β· Test: {len(X_test):,}") -# Show class distribution to check for degenerate labels -with st.expander("π¬ Label distribution (train set)", expanded=False): - unique, counts = np.unique(y_train_l, return_counts=True) - label_names = [target_etfs[i].replace("_Ret","") if i < n_etfs else "CASH" for i in unique] - dist_df = pd.DataFrame({"Class": label_names, "Count": counts, - "Pct": (counts / counts.sum() * 100).round(1)}) - st.dataframe(dist_df) - # ββ Train all three approaches ββββββββββββββββββββββββββββββββββββββββββββββββ results = {} trained_info = {} @@ -253,27 +223,49 @@ if winner_res is None: next_date = get_next_signal_date() st.divider() +# ββ Winner signal banner ββββββββββββββββββββββββββββββββββββββββββββββββββββββ show_signal_banner(winner_res["next_signal"], next_date, winner_name) +# ββ Conviction panel (winner only) ββββββββββββββββββββββββββββββββββββββββββββ winner_proba = trained_info[winner_name]["proba"] conviction = compute_conviction(winner_proba[-1], target_etfs, include_cash) show_conviction_panel(conviction) st.divider() + +# ββ All models' next day signals ββββββββββββββββββββββββββββββββββββββββββββββ +all_signals = { + name: { + "signal": res["next_signal"], + "proba": trained_info[name]["proba"][-1], + "is_winner": name == winner_name, + } + for name, res in results.items() if res is not None +} 
+show_all_signals_panel(all_signals, target_etfs, include_cash, next_date) + +st.divider() + +# ββ Winner performance metrics ββββββββββββββββββββββββββββββββββββββββββββββββ st.subheader(f"π {winner_name} β Performance Metrics") show_metrics_row(winner_res, tbill_rate) st.divider() + +# ββ Comparison table ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.subheader("π Approach Comparison (Winner = Highest Raw Annualised Return)") comparison_df = build_comparison_table(results, winner_name) show_comparison_table(comparison_df) -st.plotly_chart(comparison_bar_chart(results, winner_name), use_container_width=True) st.divider() + +# ββ Equity curves βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.subheader("π Out-of-Sample Equity Curves β All Approaches vs Benchmarks") fig = equity_curve_chart(results, winner_name, test_dates, df, test_slice, tbill_rate) st.plotly_chart(fig, use_container_width=True) st.divider() + +# ββ Audit trail (winner) ββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.subheader(f"π Audit Trail β {winner_name} (Last 20 Trading Days)") show_audit_trail(winner_res["audit_trail"]) diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py index f7484b3e44c030816ad8f2d236eeedf500b4b341..0f9fef47d7abc5d77c115965ba7fccaa081841a8 100644 --- 
a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py @@ -1,12 +1,8 @@ """ data/loader.py Loads master_data.parquet from HF Dataset. -Validates freshness against the last NYSE trading day. -No external pings β all data comes from HF Dataset only. - -Actual dataset columns (confirmed from parquet inspection): - ETFs : AGG, GLD, SLV, SPY, TBT, TLT, VNQ - Macro : VIX, DXY, T10Y2Y, TBILL_3M, IG_SPREAD, HY_SPREAD +Engineers rich feature set from raw price/macro columns. +No external pings β all data from HF Dataset only. 
""" import pandas as pd @@ -22,9 +18,8 @@ try: except ImportError: NYSE_CAL_AVAILABLE = False -DATASET_REPO = "P2SAMAPA/fi-etf-macro-signal-master-data" -PARQUET_FILE = "master_data.parquet" - +DATASET_REPO = "P2SAMAPA/fi-etf-macro-signal-master-data" +PARQUET_FILE = "master_data.parquet" TARGET_ETF_COLS = ["TLT", "TBT", "VNQ", "SLV", "GLD"] BENCHMARK_COLS = ["SPY", "AGG"] TBILL_COL = "TBILL_3M" @@ -64,16 +59,13 @@ def load_dataset(hf_token: str) -> pd.DataFrame: token=hf_token, ) df = pd.read_parquet(path) - if not isinstance(df.index, pd.DatetimeIndex): for col in ["Date", "date", "DATE"]: if col in df.columns: df = df.set_index(col) break df.index = pd.to_datetime(df.index) - return df.sort_index() - except Exception as e: st.error(f"β Failed to load dataset: {e}") return pd.DataFrame() @@ -84,11 +76,9 @@ def load_dataset(hf_token: str) -> pd.DataFrame: def check_data_freshness(df: pd.DataFrame) -> dict: if df.empty: return {"fresh": False, "message": "Dataset is empty."} - last = df.index[-1].date() expect = get_last_nyse_trading_day() fresh = last >= expect - msg = ( f"β Dataset up to date through **{last}**." if fresh else f"β οΈ **{expect}** data not yet updated. Latest: **{last}**. " @@ -98,106 +88,139 @@ def check_data_freshness(df: pd.DataFrame) -> dict: "expected_date": expect, "message": msg} -# ββ Detect whether a column holds prices or returns βββββββββββββββββββββββββββ +# ββ Price β returns βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ -def _is_price_series(series: pd.Series) -> bool: - """ - Heuristic: a price series has abs(median) > 2 and std/mean < 0.5. - A return series has abs(median) < 0.1 and many values near zero. - """ +def _to_returns(series: pd.Series) -> pd.Series: + """Convert price series to daily pct returns. If already returns, pass through.""" clean = series.dropna() if len(clean) == 0: - return False - med = abs(clean.median()) - # Strong price signal: median > 2 (e.g. 
TLT ~ 90, TBT ~ 20) - if med > 2: - return True - # Strong return signal: most values between -0.2 and 0.2 - if (clean.abs() < 0.2).mean() > 0.9: - return False - return med > 0.5 + return series + if abs(clean.median()) > 2: # price series + return series.pct_change() + return series # already returns + + +# ββ Feature engineering βββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def _engineer_features(df: pd.DataFrame, ret_cols: list) -> pd.DataFrame: + """ + Build a rich feature set from raw macro + ETF return columns. + + Features added per ETF return: + - 1d, 5d, 21d lagged returns + - 5d, 21d rolling volatility + - 5d, 21d momentum (cumulative return) + + Features added per macro column: + - raw value (z-scored over rolling 252d window) + - 5d change + - 1d lag + + Also adds: + - TBILL_3M as a feature (rate level) + - VIX regime flag (VIX > 25) + - Yield curve slope (already T10Y2Y) + - Cross-asset momentum: spread between TLT_ret and TBT_ret + """ + feat = pd.DataFrame(index=df.index) + + # ββ ETF return features βββββββββββββββββββββββββββββββββββββββββββββββββββ + for col in ret_cols: + r = df[col] + feat[f"{col}_lag1"] = r.shift(1) + feat[f"{col}_lag5"] = r.shift(5) + feat[f"{col}_lag21"] = r.shift(21) + feat[f"{col}_vol5"] = r.rolling(5).std() + feat[f"{col}_vol21"] = r.rolling(21).std() + feat[f"{col}_mom5"] = r.rolling(5).sum() + feat[f"{col}_mom21"] = r.rolling(21).sum() + + # ββ Macro features ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + for col in MACRO_COLS: + if col not in df.columns: + continue + s = df[col] + # Z-score over rolling 252-day window + roll_mean = s.rolling(252, min_periods=63).mean() + roll_std = s.rolling(252, min_periods=63).std() + feat[f"{col}_z"] = (s - roll_mean) / (roll_std + 1e-9) + feat[f"{col}_chg5"] = s.diff(5) + feat[f"{col}_lag1"] = s.shift(1) + + # ββ TBILL level βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + if TBILL_COL in df.columns: + tbill = df[TBILL_COL] + 
feat["TBILL_level"] = tbill + feat["TBILL_chg5"] = tbill.diff(5) + + # ββ Derived cross-asset signals βββββββββββββββββββββββββββββββββββββββββββ + if "TLT_Ret" in df.columns and "TBT_Ret" in df.columns: + feat["TLT_TBT_spread_mom5"] = ( + df["TLT_Ret"].rolling(5).sum() - df["TBT_Ret"].rolling(5).sum() + ) + + if "VIX" in df.columns: + feat["VIX_regime"] = (df["VIX"] > 25).astype(float) + feat["VIX_mom5"] = df["VIX"].diff(5) + + if "T10Y2Y" in df.columns: + feat["YC_inverted"] = (df["T10Y2Y"] < 0).astype(float) + if "IG_SPREAD" in df.columns and "HY_SPREAD" in df.columns: + feat["credit_ratio"] = df["HY_SPREAD"] / (df["IG_SPREAD"] + 1e-9) -# ββ Feature / target extraction βββββββββββββββββββββββββββββββββββββββββββββββ + return feat + + +# ββ Main extraction function ββββββββββββββββββββββββββββββββββββββββββββββββββ def get_features_and_targets(df: pd.DataFrame): """ - Build return columns for target ETFs and benchmarks. - Auto-detects whether source columns are prices or already returns. + Build return columns for target ETFs and engineer a rich feature set. Returns: input_features : list[str] target_etfs : list[str] e.g. ["TLT_Ret", ...] tbill_rate : float - df : DataFrame with _Ret columns added - col_info : dict of diagnostics for sidebar display + df_out : DataFrame with all columns + col_info : dict of diagnostics """ missing = [c for c in TARGET_ETF_COLS if c not in df.columns] if missing: raise ValueError( f"Missing ETF columns: {missing}. 
" - f"Found in dataset: {list(df.columns)}" + f"Found: {list(df.columns)}" ) col_info = {} - # ββ Build _Ret columns ββββββββββββββββββββββββββββββββββββββββββββββββββββ - def make_ret(col): + # ββ Build ETF return columns ββββββββββββββββββββββββββββββββββββββββββββββ + target_etfs = [] + for col in TARGET_ETF_COLS: ret_col = f"{col}_Ret" - if ret_col in df.columns: - col_info[col] = "pre-computed _Ret" - return ret_col - if _is_price_series(df[col]): - df[ret_col] = df[col].pct_change() - col_info[col] = f"priceβpct_change (median={df[col].median():.2f})" - else: - df[ret_col] = df[col] - col_info[col] = f"used as-is (median={df[col].median():.4f})" - return ret_col - - target_etfs = [make_ret(c) for c in TARGET_ETF_COLS] - benchmark_rets = [make_ret(c) for c in BENCHMARK_COLS if c in df.columns] - - # Drop NaN rows (first row from pct_change) + df[ret_col] = _to_returns(df[col]) + med = abs(df[col].dropna().median()) + col_info[col] = f"priceβpct_change (median={med:.2f})" if med > 2 else f"used as-is (median={med:.4f})" + target_etfs.append(ret_col) + + # ββ Build benchmark return columns ββββββββββββββββββββββββββββββββββββββββ + for col in BENCHMARK_COLS: + if col in df.columns: + df[f"{col}_Ret"] = _to_returns(df[col]) + + # ββ Drop NaN from first pct_change row ββββββββββββββββββββββββββββββββββββ df = df.dropna(subset=target_etfs).copy() - # Sanity check: target returns should be small daily values - for ret_col in target_etfs: - med = df[ret_col].abs().median() - if med > 0.1: - st.warning( - f"β οΈ {ret_col} has median absolute value {med:.4f} β " - f"these may not be daily returns. Check dataset column '{ret_col.replace('_Ret','')}'. 
" - f"Sample values: {df[ret_col].tail(3).values}" - ) - - # ββ Input features ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ - exclude = set( - TARGET_ETF_COLS + BENCHMARK_COLS + target_etfs + benchmark_rets + - [f"{c}_Ret" for c in BENCHMARK_COLS] + [TBILL_COL] - ) + # ββ Engineer features βββββββββββββββββββββββββββββββββββββββββββββββββββββ + feat_df = _engineer_features(df, target_etfs) + + # Merge features into df + for col in feat_df.columns: + df[col] = feat_df[col].values - # First try known macro columns - input_features = [c for c in MACRO_COLS if c in df.columns and c not in exclude] - - # Then add any engineered signal columns - extra = [ - c for c in df.columns - if c not in exclude - and c not in input_features - and any(k in c for k in ["_Z", "_Vol", "Regime", "YC_", "Credit_", - "Rates_", "VIX_", "Spread", "DXY", "T10Y", - "TBILL", "SOFR", "MOVE"]) - and pd.api.types.is_numeric_dtype(df[c]) - ] - input_features += extra - - # Fallback: all numeric non-excluded columns - if not input_features: - input_features = [ - c for c in df.columns - if c not in exclude and pd.api.types.is_numeric_dtype(df[c]) - ] + # Drop rows with NaN in features (from lags/rolling) + feat_cols = list(feat_df.columns) + df = df.dropna(subset=feat_cols).copy() # ββ T-bill rate βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ tbill_rate = 0.045 @@ -207,6 +230,14 @@ def get_features_and_targets(df: pd.DataFrame): v = float(raw.iloc[-1]) tbill_rate = v / 100 if v > 1 else v + # Input features = all engineered feature columns + exclude = set( + TARGET_ETF_COLS + BENCHMARK_COLS + target_etfs + + [f"{c}_Ret" for c in BENCHMARK_COLS] + [TBILL_COL] + + list(MACRO_COLS) + ) + input_features = [c for c in feat_cols if c not in exclude] + return input_features, target_etfs, tbill_rate, df, col_info diff --git 
a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py index 97ceca9077e0805a4127a483233c87465d782c6e..d3e04d6d6eaed853e7143f18f88a5fb069c05d86 100644 --- a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py @@ -9,7 +9,6 @@ import streamlit as st import pandas as pd import numpy as np -# ββ Module imports ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ from data.loader import (load_dataset, check_data_freshness, get_features_and_targets, dataset_summary) from utils.calendar import get_est_time, is_sync_window, get_next_signal_date @@ -26,20 +25,13 @@ from ui.components import ( ) from ui.charts import equity_curve_chart, comparison_bar_chart -# ββ Page config βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ -st.set_page_config( - page_title="P2-ETF-CNN-LSTM", - page_icon="π§ ", - layout="wide", -) 
+st.set_page_config(page_title="P2-ETF-CNN-LSTM", page_icon="π§ ", layout="wide") -# ββ Secrets βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ HF_TOKEN = os.getenv("HF_TOKEN", "") # ββ Sidebar βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ with st.sidebar: st.header("βοΈ Configuration") - now_est = get_est_time() st.write(f"π **EST:** {now_est.strftime('%H:%M:%S')}") if is_sync_window(): @@ -48,25 +40,19 @@ with st.sidebar: st.info("βΈοΈ Sync Window Inactive") st.divider() - start_yr = st.slider("π Start Year", 2010, 2024, 2016) fee_bps = st.slider("π° Fee (bps)", 0, 50, 10) lookback = st.slider("π Lookback (days)", 20, 60, 30, step=5) epochs = st.number_input("π Max Epochs", 20, 300, 100, step=10) st.divider() - split_option = st.selectbox("π Train/Val/Test Split", ["70/15/15", "80/10/10"], index=0) - split_map = {"70/15/15": (0.70, 0.15), "80/10/10": (0.80, 0.10)} - train_pct, val_pct = split_map[split_option] + train_pct, val_pct = {"70/15/15": (0.70, 0.15), "80/10/10": (0.80, 0.10)}[split_option] - include_cash = st.checkbox( - "π΅ Include CASH class", value=True, - help="Model can select CASH (earns T-bill rate) instead of any ETF", - ) + include_cash = st.checkbox("π΅ Include CASH class", value=True, + help="Model can select CASH (earns T-bill rate) instead of any ETF") st.divider() - run_button = st.button("π Run All 3 Approaches", type="primary", use_container_width=True) # ββ Title βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ @@ -74,9 +60,8 @@ st.title("π§ P2-ETF-CNN-LSTM") st.caption("Approach 1: Wavelet Β· Approach 2: Regime-Conditioned Β· Approach 3: Multi-Scale Parallel") st.caption("Winner selected by highest raw annualised return on out-of-sample test set.") -# ββ Token check βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ if not HF_TOKEN: - st.error("β HF_TOKEN secret not found. 
Add it to HF Space / GitHub secrets.") + st.error("β HF_TOKEN secret not found.") st.stop() # ββ Load dataset ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ @@ -86,11 +71,10 @@ with st.spinner("π‘ Loading dataset from HuggingFace..."): if df_raw.empty: st.stop() -# ββ Freshness check βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ freshness = check_data_freshness(df_raw) show_freshness_status(freshness) -# ββ Dataset summary in sidebar ββββββββββββββββββββββββββββββββββββββββββββββββ +# ββ Dataset info sidebar ββββββββββββββββββββββββββββββββββββββββββββββββββββββ with st.sidebar: st.divider() st.subheader("π¦ Dataset Info") @@ -103,21 +87,21 @@ with st.sidebar: st.write(f"**Macro:** {', '.join(summary['macro_found'])}") st.write(f"**T-bill col:** {'β ' if summary['tbill_found'] else 'β'}") -# ββ Wait for run button βββββββββββββββββββββββββββββββββββββββββββββββββββββββ + with st.expander("π All columns"): + st.write(summary["all_cols"]) + if not run_button: - st.info("π Configure parameters in the sidebar and click **π Run All 3 Approaches**.") + st.info("π Configure parameters and click **π Run All 3 Approaches**.") st.stop() # ββ Filter by start year ββββββββββββββββββββββββββββββββββββββββββββββββββββββ df = df_raw[df_raw.index.year >= start_yr].copy() -st.write( - f"π **Data:** {df.index[0].strftime('%Y-%m-%d')} β {df.index[-1].strftime('%Y-%m-%d')} " - f"({df.index[-1].year - df.index[0].year + 1} years)" -) +st.write(f"π **Data:** {df.index[0].strftime('%Y-%m-%d')} β {df.index[-1].strftime('%Y-%m-%d')} " + f"({df.index[-1].year - df.index[0].year + 1} years)") # ββ Features & targets ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ try: - input_features, target_etfs, tbill_rate, df = get_features_and_targets(df) + input_features, target_etfs, tbill_rate, df, col_info = get_features_and_targets(df) except ValueError as e: st.error(str(e)) st.stop() @@ -125,6 +109,18 @@ except ValueError as e: n_etfs = len(target_etfs) 
n_classes = n_etfs + (1 if include_cash else 0) +# ββ Show column detection diagnostics ββββββββββββββββββββββββββββββββββββββββ +with st.expander("π¬ Column detection diagnostics", expanded=False): + st.write("**How each ETF column was interpreted:**") + for col, info in col_info.items(): + st.write(f"- `{col}`: {info}") + st.write(f"**Input features ({len(input_features)}):** {input_features}") + st.write(f"**T-bill rate used:** {tbill_rate*100:.3f}%") + + # Show sample return values to verify correctness + st.write("**Sample target return values (last 3 rows):**") + st.dataframe(df[target_etfs].tail(3)) + st.info( f"π― **Targets:** {', '.join([t.replace('_Ret','') for t in target_etfs])} Β· " f"**Features:** {len(input_features)} signals Β· " @@ -135,13 +131,20 @@ st.info( X_raw = df[input_features].values.astype(np.float32) y_raw = df[target_etfs].values.astype(np.float32) -# Fill any remaining NaNs with column means +# Fill NaNs col_means = np.nanmean(X_raw, axis=0) for j in range(X_raw.shape[1]): mask = np.isnan(X_raw[:, j]) if mask.any(): X_raw[mask, j] = col_means[j] +# Also fill NaNs in y_raw +y_means = np.nanmean(y_raw, axis=0) +for j in range(y_raw.shape[1]): + mask = np.isnan(y_raw[:, j]) + if mask.any(): + y_raw[mask, j] = y_means[j] + X_seq, y_seq = build_sequences(X_raw, y_raw, lookback) y_labels = returns_to_labels(y_seq, include_cash=include_cash) @@ -154,27 +157,30 @@ X_train_s, X_val_s, X_test_s, _ = scale_features(X_train, X_val, X_test) train_size = len(X_train) val_size = len(X_val) - test_start = lookback + train_size + val_size test_dates = df.index[test_start: test_start + len(X_test)] test_slice = slice(test_start, test_start + len(X_test)) -st.success( - f"β Sequences β Train: {train_size:,} Β· Val: {val_size:,} Β· Test: {len(X_test):,}" -) +st.success(f"β Sequences β Train: {train_size:,} Β· Val: {val_size:,} Β· Test: {len(X_test):,}") + +# Show class distribution to check for degenerate labels +with st.expander("π¬ Label distribution 
(train set)", expanded=False): + unique, counts = np.unique(y_train_l, return_counts=True) + label_names = [target_etfs[i].replace("_Ret","") if i < n_etfs else "CASH" for i in unique] + dist_df = pd.DataFrame({"Class": label_names, "Count": counts, + "Pct": (counts / counts.sum() * 100).round(1)}) + st.dataframe(dist_df) # ββ Train all three approaches ββββββββββββββββββββββββββββββββββββββββββββββββ results = {} trained_info = {} +progress = st.progress(0, text="Starting training...") -progress = st.progress(0, text="Starting training...") - -# ββ Approach 1 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +# Approach 1 with st.spinner("π Training Approach 1 β Wavelet CNN-LSTM..."): try: model1, hist1, _ = train_approach1( - X_train_s, y_train_l, - X_val_s, y_val_l, + X_train_s, y_train_l, X_val_s, y_val_l, n_classes=n_classes, epochs=int(epochs), ) preds1, proba1 = predict_approach1(model1, X_test_s) @@ -190,17 +196,13 @@ with st.spinner("π Training Approach 1 β Wavelet CNN-LSTM..."): progress.progress(33, text="Approach 1 done...") -# ββ Approach 2 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +# Approach 2 with st.spinner("π Training Approach 2 β Regime-Conditioned CNN-LSTM..."): try: model2, hist2, hmm2, regime_cols2 = train_approach2( - X_train_s, y_train_l, - X_val_s, y_val_l, - X_flat_all=X_raw, - feature_names=input_features, - lookback=lookback, - train_size=train_size, - val_size=val_size, + X_train_s, y_train_l, X_val_s, y_val_l, + X_flat_all=X_raw, feature_names=input_features, + lookback=lookback, train_size=train_size, val_size=val_size, n_classes=n_classes, epochs=int(epochs), ) preds2, proba2 = predict_approach2( @@ -219,12 +221,11 @@ with st.spinner("π Training Approach 2 β Regime-Conditioned CNN-LSTM..."): progress.progress(66, text="Approach 2 done...") -# ββ Approach 3 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +# Approach 3 with st.spinner("π‘ Training Approach 3 β Multi-Scale 
CNN-LSTM..."): try: model3, hist3 = train_approach3( - X_train_s, y_train_l, - X_val_s, y_val_l, + X_train_s, y_train_l, X_val_s, y_val_l, n_classes=n_classes, epochs=int(epochs), ) preds3, proba3 = predict_approach3(model3, X_test_s) @@ -250,41 +251,29 @@ if winner_res is None: st.stop() next_date = get_next_signal_date() - st.divider() -# ββ Signal banner βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ show_signal_banner(winner_res["next_signal"], next_date, winner_name) -# ββ Conviction panel ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ winner_proba = trained_info[winner_name]["proba"] conviction = compute_conviction(winner_proba[-1], target_etfs, include_cash) show_conviction_panel(conviction) st.divider() - -# ββ Winner metrics ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.subheader(f"π {winner_name} β Performance Metrics") show_metrics_row(winner_res, tbill_rate) st.divider() - -# ββ Comparison table ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.subheader("π Approach Comparison (Winner = Highest Raw Annualised Return)") comparison_df = build_comparison_table(results, winner_name) show_comparison_table(comparison_df) - st.plotly_chart(comparison_bar_chart(results, winner_name), use_container_width=True) st.divider() - -# ββ Equity curves βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.subheader("π Out-of-Sample Equity Curves β All Approaches vs Benchmarks") fig = equity_curve_chart(results, winner_name, test_dates, df, test_slice, tbill_rate) st.plotly_chart(fig, use_container_width=True) st.divider() - -# ββ Audit trail βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.subheader(f"π Audit Trail β {winner_name} (Last 20 Trading Days)") show_audit_trail(winner_res["audit_trail"]) diff --git 
a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py index ae2533008e483f244c7dd3a1e189bb64e4488a4f..f7484b3e44c030816ad8f2d236eeedf500b4b341 100644 --- a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py @@ -4,7 +4,7 @@ Loads master_data.parquet from HF Dataset. Validates freshness against the last NYSE trading day. No external pings β all data comes from HF Dataset only. 
-Actual dataset columns (from parquet inspection): +Actual dataset columns (confirmed from parquet inspection): ETFs : AGG, GLD, SLV, SPY, TBT, TLT, VNQ Macro : VIX, DXY, T10Y2Y, TBILL_3M, IG_SPREAD, HY_SPREAD """ @@ -15,7 +15,6 @@ import streamlit as st from huggingface_hub import hf_hub_download from datetime import datetime, timedelta import pytz -import os try: import pandas_market_calendars as mcal @@ -26,33 +25,27 @@ except ImportError: DATASET_REPO = "P2SAMAPA/fi-etf-macro-signal-master-data" PARQUET_FILE = "master_data.parquet" -# ββ Actual column names in the dataset βββββββββββββββββββββββββββββββββββββββ -TARGET_ETF_COLS = ["TLT", "TBT", "VNQ", "SLV", "GLD"] # traded ETFs -BENCHMARK_COLS = ["SPY", "AGG"] # chart only -TBILL_COL = "TBILL_3M" # 3m T-bill rate -MACRO_COLS = ["VIX", "DXY", "T10Y2Y", "IG_SPREAD", "HY_SPREAD"] +TARGET_ETF_COLS = ["TLT", "TBT", "VNQ", "SLV", "GLD"] +BENCHMARK_COLS = ["SPY", "AGG"] +TBILL_COL = "TBILL_3M" +MACRO_COLS = ["VIX", "DXY", "T10Y2Y", "IG_SPREAD", "HY_SPREAD"] -# ββ NYSE calendar helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββ +# ββ NYSE calendar βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ def get_last_nyse_trading_day(as_of=None): - """Return the most recent NYSE trading day on or before as_of (default: today EST).""" est = pytz.timezone("US/Eastern") if as_of is None: as_of = datetime.now(est) today = as_of.date() - if NYSE_CAL_AVAILABLE: try: nyse = mcal.get_calendar("NYSE") - start = today - timedelta(days=10) - sched = nyse.schedule(start_date=start, end_date=today) + sched = nyse.schedule(start_date=today - timedelta(days=10), end_date=today) if len(sched) > 0: return sched.index[-1].date() except Exception: pass - - # Fallback: skip weekends candidate = today while candidate.weekday() >= 5: candidate -= timedelta(days=1) @@ -63,10 +56,6 @@ def get_last_nyse_trading_day(as_of=None): @st.cache_data(ttl=3600, show_spinner=False) def load_dataset(hf_token: str) -> pd.DataFrame: - """ - 
Download master_data.parquet from HF Dataset and return as DataFrame. - Cached for 1 hour. Index is parsed as DatetimeIndex. - """ try: path = hf_hub_download( repo_id=DATASET_REPO, @@ -76,7 +65,6 @@ def load_dataset(hf_token: str) -> pd.DataFrame: ) df = pd.read_parquet(path) - # Ensure DatetimeIndex if not isinstance(df.index, pd.DatetimeIndex): for col in ["Date", "date", "DATE"]: if col in df.columns: @@ -84,66 +72,66 @@ def load_dataset(hf_token: str) -> pd.DataFrame: break df.index = pd.to_datetime(df.index) - df = df.sort_index() - return df + return df.sort_index() except Exception as e: - st.error(f"β Failed to load dataset from HuggingFace: {e}") + st.error(f"β Failed to load dataset: {e}") return pd.DataFrame() # ββ Freshness check βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ def check_data_freshness(df: pd.DataFrame) -> dict: - """ - Check whether the dataset contains data for the last NYSE trading day. - """ if df.empty: - return { - "fresh": False, - "last_date_in_data": None, - "expected_date": None, - "message": "Dataset is empty.", - } - - last_date_in_data = df.index[-1].date() - expected_date = get_last_nyse_trading_day() - fresh = last_date_in_data >= expected_date - - if fresh: - message = f"β Dataset is up to date through **{last_date_in_data}**." - else: - message = ( - f"β οΈ **{expected_date}** data not yet updated in dataset. " - f"Latest available: **{last_date_in_data}**. " - f"Please check back later β the dataset updates daily after market close." - ) + return {"fresh": False, "message": "Dataset is empty."} - return { - "fresh": fresh, - "last_date_in_data": last_date_in_data, - "expected_date": expected_date, - "message": message, - } + last = df.index[-1].date() + expect = get_last_nyse_trading_day() + fresh = last >= expect + + msg = ( + f"β Dataset up to date through **{last}**." if fresh else + f"β οΈ **{expect}** data not yet updated. Latest: **{last}**. " + f"Dataset updates daily after market close." 
+ ) + return {"fresh": fresh, "last_date_in_data": last, + "expected_date": expect, "message": msg} + + +# ββ Detect whether a column holds prices or returns βββββββββββββββββββββββββββ + +def _is_price_series(series: pd.Series) -> bool: + """ + Heuristic: a price series has abs(median) > 2 and std/mean < 0.5. + A return series has abs(median) < 0.1 and many values near zero. + """ + clean = series.dropna() + if len(clean) == 0: + return False + med = abs(clean.median()) + # Strong price signal: median > 2 (e.g. TLT ~ 90, TBT ~ 20) + if med > 2: + return True + # Strong return signal: most values between -0.2 and 0.2 + if (clean.abs() < 0.2).mean() > 0.9: + return False + return med > 0.5 # ββ Feature / target extraction βββββββββββββββββββββββββββββββββββββββββββββββ def get_features_and_targets(df: pd.DataFrame): """ - Extract input feature columns and target ETF return columns. - - The dataset stores raw price or return values directly under ticker names. - We compute daily log returns for target ETFs if they are not already returns. + Build return columns for target ETFs and benchmarks. + Auto-detects whether source columns are prices or already returns. Returns: - input_features : list of column names to use as model inputs - target_etfs : list of ETF column names (after return computation) - tbill_rate : latest 3m T-bill rate as float (annualised, e.g. 0.045) - df : DataFrame (possibly with new _Ret columns added) + input_features : list[str] + target_etfs : list[str] e.g. ["TLT_Ret", ...] 
+ tbill_rate : float + df : DataFrame with _Ret columns added + col_info : dict of diagnostics for sidebar display """ - - # ββ Confirm target ETFs exist βββββββββββββββββββββββββββββββββββββββββββββ missing = [c for c in TARGET_ETF_COLS if c not in df.columns] if missing: raise ValueError( @@ -151,71 +139,75 @@ def get_features_and_targets(df: pd.DataFrame): f"Found in dataset: {list(df.columns)}" ) - # ββ Build return columns ββββββββββββββββββββββββββββββββββββββββββββββββββ - # If values look like prices (>5), compute pct returns. - # If they already look like small returns (<1 in abs), use as-is. - target_etfs = [] - for col in TARGET_ETF_COLS: - ret_col = f"{col}_Ret" - if ret_col not in df.columns: - sample = df[col].dropna() - if len(sample) > 0 and abs(sample.median()) > 1: - # Looks like price β compute pct change - df[ret_col] = df[col].pct_change() - else: - # Already returns - df[ret_col] = df[col] - target_etfs.append(ret_col) - - # Same for benchmarks - for col in BENCHMARK_COLS: - ret_col = f"{col}_Ret" - if ret_col not in df.columns and col in df.columns: - sample = df[col].dropna() - if len(sample) > 0 and abs(sample.median()) > 1: - df[ret_col] = df[col].pct_change() - else: - df[ret_col] = df[col] + col_info = {} - # Drop rows with NaN in target columns (first row after pct_change) - df = df.dropna(subset=target_etfs) + # ββ Build _Ret columns ββββββββββββββββββββββββββββββββββββββββββββββββββββ + def make_ret(col): + ret_col = f"{col}_Ret" + if ret_col in df.columns: + col_info[col] = "pre-computed _Ret" + return ret_col + if _is_price_series(df[col]): + df[ret_col] = df[col].pct_change() + col_info[col] = f"priceβpct_change (median={df[col].median():.2f})" + else: + df[ret_col] = df[col] + col_info[col] = f"used as-is (median={df[col].median():.4f})" + return ret_col + + target_etfs = [make_ret(c) for c in TARGET_ETF_COLS] + benchmark_rets = [make_ret(c) for c in BENCHMARK_COLS if c in df.columns] + + # Drop NaN rows (first row from 
pct_change) + df = df.dropna(subset=target_etfs).copy() + + # Sanity check: target returns should be small daily values + for ret_col in target_etfs: + med = df[ret_col].abs().median() + if med > 0.1: + st.warning( + f"β οΈ {ret_col} has median absolute value {med:.4f} β " + f"these may not be daily returns. Check dataset column '{ret_col.replace('_Ret','')}'. " + f"Sample values: {df[ret_col].tail(3).values}" + ) # ββ Input features ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ - # Use macro columns directly; exclude ETF price/return cols and benchmarks exclude = set( - TARGET_ETF_COLS + BENCHMARK_COLS + - target_etfs + - [f"{c}_Ret" for c in BENCHMARK_COLS] + - [TBILL_COL] + TARGET_ETF_COLS + BENCHMARK_COLS + target_etfs + benchmark_rets + + [f"{c}_Ret" for c in BENCHMARK_COLS] + [TBILL_COL] ) - input_features = [ + # First try known macro columns + input_features = [c for c in MACRO_COLS if c in df.columns and c not in exclude] + + # Then add any engineered signal columns + extra = [ c for c in df.columns if c not in exclude - and c in (MACRO_COLS + [ - col for col in df.columns - if any(k in col for k in ["_Z", "_Vol", "Regime", "YC_", "Credit_", - "Rates_", "VIX_", "Spread", "DXY", "T10Y"]) - ]) + and c not in input_features + and any(k in c for k in ["_Z", "_Vol", "Regime", "YC_", "Credit_", + "Rates_", "VIX_", "Spread", "DXY", "T10Y", + "TBILL", "SOFR", "MOVE"]) + and pd.api.types.is_numeric_dtype(df[c]) ] + input_features += extra - # Fallback: if none matched, use all non-excluded numeric columns + # Fallback: all numeric non-excluded columns if not input_features: input_features = [ c for c in df.columns - if c not in exclude - and pd.api.types.is_numeric_dtype(df[c]) + if c not in exclude and pd.api.types.is_numeric_dtype(df[c]) ] # ββ T-bill rate βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ - tbill_rate = 0.045 # default + tbill_rate = 0.045 if TBILL_COL in df.columns: raw = df[TBILL_COL].dropna() if len(raw) > 0: - last_val = 
float(raw.iloc[-1]) - tbill_rate = last_val / 100 if last_val > 1 else last_val + v = float(raw.iloc[-1]) + tbill_rate = v / 100 if v > 1 else v - return input_features, target_etfs, tbill_rate, df + return input_features, target_etfs, tbill_rate, df, col_info # ββ Dataset summary βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ @@ -228,8 +220,9 @@ def dataset_summary(df: pd.DataFrame) -> dict: "columns": len(df.columns), "start_date": df.index[0].strftime("%Y-%m-%d"), "end_date": df.index[-1].strftime("%Y-%m-%d"), - "etfs_found": [c for c in TARGET_ETF_COLS if c in df.columns], - "benchmarks": [c for c in BENCHMARK_COLS if c in df.columns], - "macro_found": [c for c in MACRO_COLS if c in df.columns], + "etfs_found": [c for c in TARGET_ETF_COLS if c in df.columns], + "benchmarks": [c for c in BENCHMARK_COLS if c in df.columns], + "macro_found": [c for c in MACRO_COLS if c in df.columns], "tbill_found": TBILL_COL in df.columns, + "all_cols": list(df.columns), } diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py index eded65d240364585407db8613bb5c2acdbe1d7d9..97ceca9077e0805a4127a483233c87465d782c6e 100644 --- 
a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py @@ -10,9 +10,11 @@ import pandas as pd import numpy as np # ββ Module imports ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ -from data.loader import load_dataset, check_data_freshness, get_features_and_targets, dataset_summary +from data.loader import (load_dataset, check_data_freshness, + get_features_and_targets, dataset_summary) from utils.calendar import get_est_time, is_sync_window, get_next_signal_date -from models.base import build_sequences, train_val_test_split, scale_features, returns_to_labels +from models.base import (build_sequences, train_val_test_split, + scale_features, returns_to_labels) from models.approach1_wavelet import train_approach1, predict_approach1 from models.approach2_regime import train_approach2, predict_approach2 from models.approach3_multiscale import train_approach3, predict_approach3 @@ -47,10 +49,10 @@ with st.sidebar: st.divider() - start_yr = st.slider("π Start Year", 2010, 2024, 2016) - fee_bps = st.slider("π° Fee (bps)", 0, 50, 10) - lookback = st.slider("π Lookback (days)", 20, 60, 30, step=5) - epochs = st.number_input("π Max Epochs", 20, 300, 100, step=10) + start_yr = st.slider("π Start Year", 2010, 2024, 2016) + fee_bps = st.slider("π° Fee (bps)", 0, 50, 10) + lookback = st.slider("π Lookback (days)", 20, 60, 30, step=5) + epochs = st.number_input("π Max Epochs", 20, 
300, 100, step=10) st.divider() @@ -58,8 +60,10 @@ with st.sidebar: split_map = {"70/15/15": (0.70, 0.15), "80/10/10": (0.80, 0.10)} train_pct, val_pct = split_map[split_option] - include_cash = st.checkbox("π΅ Include CASH class", value=True, - help="Model can select CASH (earns T-bill rate) as an alternative to any ETF") + include_cash = st.checkbox( + "π΅ Include CASH class", value=True, + help="Model can select CASH (earns T-bill rate) instead of any ETF", + ) st.divider() @@ -70,90 +74,102 @@ st.title("π§ P2-ETF-CNN-LSTM") st.caption("Approach 1: Wavelet Β· Approach 2: Regime-Conditioned Β· Approach 3: Multi-Scale Parallel") st.caption("Winner selected by highest raw annualised return on out-of-sample test set.") -# ββ Load data (always, to check freshness) ββββββββββββββββββββββββββββββββββββ +# ββ Token check βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ if not HF_TOKEN: - st.error("β HF_TOKEN secret not found. Please add it to your HF Space / GitHub secrets.") + st.error("β HF_TOKEN secret not found. 
Add it to HF Space / GitHub secrets.") st.stop() +# ββ Load dataset ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ with st.spinner("π‘ Loading dataset from HuggingFace..."): - df = load_dataset(HF_TOKEN) + df_raw = load_dataset(HF_TOKEN) -if df.empty: +if df_raw.empty: st.stop() # ββ Freshness check βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ -freshness = check_data_freshness(df) +freshness = check_data_freshness(df_raw) show_freshness_status(freshness) # ββ Dataset summary in sidebar ββββββββββββββββββββββββββββββββββββββββββββββββ with st.sidebar: st.divider() st.subheader("π¦ Dataset Info") - summary = dataset_summary(df) + summary = dataset_summary(df_raw) if summary: st.write(f"**Rows:** {summary['rows']:,}") st.write(f"**Range:** {summary['start_date']} β {summary['end_date']}") - st.write(f"**ETFs:** {', '.join([e.replace('_Ret','') for e in summary['etfs_found']])}") - st.write(f"**Benchmarks:** {', '.join([b.replace('_Ret','') for b in summary['benchmarks']])}") + st.write(f"**ETFs:** {', '.join(summary['etfs_found'])}") + st.write(f"**Benchmarks:** {', '.join(summary['benchmarks'])}") + st.write(f"**Macro:** {', '.join(summary['macro_found'])}") st.write(f"**T-bill col:** {'β ' if summary['tbill_found'] else 'β'}") -# ββ Main execution ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +# ββ Wait for run button βββββββββββββββββββββββββββββββββββββββββββββββββββββββ if not run_button: - st.info("π Configure parameters in the sidebar and click **π Run All 3 Approaches** to begin.") + st.info("π Configure parameters in the sidebar and click **π Run All 3 Approaches**.") st.stop() # ββ Filter by start year ββββββββββββββββββββββββββββββββββββββββββββββββββββββ -df = df[df.index.year >= start_yr].copy() -st.write(f"π **Data:** {df.index[0].strftime('%Y-%m-%d')} β {df.index[-1].strftime('%Y-%m-%d')} " - f"({df.index[-1].year - df.index[0].year + 1} years)") +df = df_raw[df_raw.index.year >= start_yr].copy() +st.write( 
+ f"π **Data:** {df.index[0].strftime('%Y-%m-%d')} β {df.index[-1].strftime('%Y-%m-%d')} " + f"({df.index[-1].year - df.index[0].year + 1} years)" +) -# ββ Feature / target extraction βββββββββββββββββββββββββββββββββββββββββββββββ +# ββ Features & targets ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ try: - input_features, target_etfs, tbill_rate = get_features_and_targets(df) + input_features, target_etfs, tbill_rate, df = get_features_and_targets(df) except ValueError as e: st.error(str(e)) st.stop() -st.info(f"π― **Targets:** {len(target_etfs)} ETFs Β· **Features:** {len(input_features)} signals Β· " - f"**T-bill rate:** {tbill_rate*100:.2f}%") +n_etfs = len(target_etfs) +n_classes = n_etfs + (1 if include_cash else 0) + +st.info( + f"π― **Targets:** {', '.join([t.replace('_Ret','') for t in target_etfs])} Β· " + f"**Features:** {len(input_features)} signals Β· " + f"**T-bill:** {tbill_rate*100:.2f}%" +) -# ββ Prepare sequences βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ -X_raw = df[input_features].values.astype(np.float32) -y_raw = df[target_etfs].values.astype(np.float32) -n_etfs = len(target_etfs) -n_classes = n_etfs + (1 if include_cash else 0) # +1 for CASH +# ββ Build sequences βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +X_raw = df[input_features].values.astype(np.float32) +y_raw = df[target_etfs].values.astype(np.float32) -# Fill NaNs with column means +# Fill any remaining NaNs with column means col_means = np.nanmean(X_raw, axis=0) for j in range(X_raw.shape[1]): mask = np.isnan(X_raw[:, j]) - X_raw[mask, j] = col_means[j] + if mask.any(): + X_raw[mask, j] = col_means[j] X_seq, y_seq = build_sequences(X_raw, y_raw, lookback) y_labels = returns_to_labels(y_seq, include_cash=include_cash) -X_train, y_train_r, X_val, y_val_r, X_test, y_test_r = train_val_test_split(X_seq, y_seq, train_pct, val_pct) -_, y_train_l, _, y_val_l, _, y_test_l = train_val_test_split(X_seq, y_labels, train_pct, val_pct) +(X_train, 
y_train_r, X_val, y_val_r, + X_test, y_test_r) = train_val_test_split(X_seq, y_seq, train_pct, val_pct) +(_, y_train_l, _, y_val_l, + _, y_test_l) = train_val_test_split(X_seq, y_labels, train_pct, val_pct) X_train_s, X_val_s, X_test_s, _ = scale_features(X_train, X_val, X_test) train_size = len(X_train) val_size = len(X_val) -# Test dates (aligned with y_test) -test_start = lookback + train_size + val_size -test_dates = df.index[test_start: test_start + len(X_test)] -test_slice = slice(test_start, test_start + len(X_test)) +test_start = lookback + train_size + val_size +test_dates = df.index[test_start: test_start + len(X_test)] +test_slice = slice(test_start, test_start + len(X_test)) -st.success(f"β Sequences β Train: {train_size} Β· Val: {val_size} Β· Test: {len(X_test)}") +st.success( + f"β Sequences β Train: {train_size:,} Β· Val: {val_size:,} Β· Test: {len(X_test):,}" +) # ββ Train all three approaches ββββββββββββββββββββββββββββββββββββββββββββββββ results = {} -trained_info = {} # store extra info needed for conviction +trained_info = {} progress = st.progress(0, text="Starting training...") -# ββ Approach 1: Wavelet βββββββββββββββββββββββββββββββββββββββββββββββββββββββ +# ββ Approach 1 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ with st.spinner("π Training Approach 1 β Wavelet CNN-LSTM..."): try: model1, hist1, _ = train_approach1( @@ -163,7 +179,8 @@ with st.spinner("π Training Approach 1 β Wavelet CNN-LSTM..."): ) preds1, proba1 = predict_approach1(model1, X_test_s) results["Approach 1"] = execute_strategy( - preds1, proba1, y_test_r, test_dates, target_etfs, fee_bps, tbill_rate, include_cash, + preds1, proba1, y_test_r, test_dates, + target_etfs, fee_bps, tbill_rate, include_cash, ) trained_info["Approach 1"] = {"proba": proba1} st.success("β Approach 1 complete") @@ -173,7 +190,7 @@ with st.spinner("π Training Approach 1 β Wavelet CNN-LSTM..."): progress.progress(33, text="Approach 1 done...") -# ββ Approach 2: 
Regime-Conditioned βββββββββββββββββββββββββββββββββββββββββββ +# ββ Approach 2 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ with st.spinner("π Training Approach 2 β Regime-Conditioned CNN-LSTM..."): try: model2, hist2, hmm2, regime_cols2 = train_approach2( @@ -191,7 +208,8 @@ with st.spinner("π Training Approach 2 β Regime-Conditioned CNN-LSTM..."): lookback, train_size, val_size, ) results["Approach 2"] = execute_strategy( - preds2, proba2, y_test_r, test_dates, target_etfs, fee_bps, tbill_rate, include_cash, + preds2, proba2, y_test_r, test_dates, + target_etfs, fee_bps, tbill_rate, include_cash, ) trained_info["Approach 2"] = {"proba": proba2} st.success("β Approach 2 complete") @@ -201,7 +219,7 @@ with st.spinner("π Training Approach 2 β Regime-Conditioned CNN-LSTM..."): progress.progress(66, text="Approach 2 done...") -# ββ Approach 3: Multi-Scale βββββββββββββββββββββββββββββββββββββββββββββββββββ +# ββ Approach 3 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ with st.spinner("π‘ Training Approach 3 β Multi-Scale CNN-LSTM..."): try: model3, hist3 = train_approach3( @@ -211,7 +229,8 @@ with st.spinner("π‘ Training Approach 3 β Multi-Scale CNN-LSTM..."): ) preds3, proba3 = predict_approach3(model3, X_test_s) results["Approach 3"] = execute_strategy( - preds3, proba3, y_test_r, test_dates, target_etfs, fee_bps, tbill_rate, include_cash, + preds3, proba3, y_test_r, test_dates, + target_etfs, fee_bps, tbill_rate, include_cash, ) trained_info["Approach 3"] = {"proba": proba3} st.success("β Approach 3 complete") @@ -227,15 +246,14 @@ winner_name = select_winner(results) winner_res = results.get(winner_name) if winner_res is None: - st.error("β All approaches failed. Please check your data and configuration.") + st.error("β All approaches failed. 
Please check data and configuration.") st.stop() -# ββ Next trading date βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ next_date = get_next_signal_date() st.divider() -# ββ Signal banner (winner) ββββββββββββββββββββββββββββββββββββββββββββββββββββ +# ββ Signal banner βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ show_signal_banner(winner_res["next_signal"], next_date, winner_name) # ββ Conviction panel ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ @@ -256,7 +274,6 @@ st.subheader("π Approach Comparison (Winner = Highest Raw Annualised Return)" comparison_df = build_comparison_table(results, winner_name) show_comparison_table(comparison_df) -# ββ Comparison bar chart ββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.plotly_chart(comparison_bar_chart(results, winner_name), use_container_width=True) st.divider() @@ -268,6 +285,6 @@ st.plotly_chart(fig, use_container_width=True) st.divider() -# ββ Audit trail (winner) ββββββββββββββββββββββββββββββββββββββββββββββββββββββ +# ββ Audit trail βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ st.subheader(f"π Audit Trail β {winner_name} (Last 20 Trading Days)") show_audit_trail(winner_res["audit_trail"]) diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py index 
b96ee711d680e273094523bcd220f93b06eb486d..ae2533008e483f244c7dd3a1e189bb64e4488a4f 100644 --- a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py @@ -3,6 +3,10 @@ data/loader.py Loads master_data.parquet from HF Dataset. Validates freshness against the last NYSE trading day. No external pings β all data comes from HF Dataset only. + +Actual dataset columns (from parquet inspection): + ETFs : AGG, GLD, SLV, SPY, TBT, TLT, VNQ + Macro : VIX, DXY, T10Y2Y, TBILL_3M, IG_SPREAD, HY_SPREAD """ import pandas as pd @@ -22,31 +26,29 @@ except ImportError: DATASET_REPO = "P2SAMAPA/fi-etf-macro-signal-master-data" PARQUET_FILE = "master_data.parquet" -# Columns expected in the dataset -REQUIRED_ETF_COLS = ["TLT_Ret", "TBT_Ret", "VNQ_Ret", "SLV_Ret", "GLD_Ret"] -BENCHMARK_COLS = ["SPY_Ret", "AGG_Ret"] -TBILL_COL = "DTB3" # 3m T-bill column in HF dataset -TARGET_ETFS = REQUIRED_ETF_COLS # 5 targets (no CASH in returns, CASH handled in strategy) +# ββ Actual column names in the dataset βββββββββββββββββββββββββββββββββββββββ +TARGET_ETF_COLS = ["TLT", "TBT", "VNQ", "SLV", "GLD"] # traded ETFs +BENCHMARK_COLS = ["SPY", "AGG"] # chart only +TBILL_COL = "TBILL_3M" # 3m T-bill rate +MACRO_COLS = ["VIX", "DXY", "T10Y2Y", "IG_SPREAD", "HY_SPREAD"] # ββ NYSE calendar helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββ -def get_last_nyse_trading_day(as_of: 
datetime = None) -> datetime.date: - """Return the most recent NYSE trading day before or on as_of (default: today EST).""" +def get_last_nyse_trading_day(as_of=None): + """Return the most recent NYSE trading day on or before as_of (default: today EST).""" est = pytz.timezone("US/Eastern") if as_of is None: as_of = datetime.now(est) - today = as_of.date() if NYSE_CAL_AVAILABLE: try: - nyse = mcal.get_calendar("NYSE") - # Look back up to 10 days to find last trading day + nyse = mcal.get_calendar("NYSE") start = today - timedelta(days=10) - schedule = nyse.schedule(start_date=start, end_date=today) - if len(schedule) > 0: - return schedule.index[-1].date() + sched = nyse.schedule(start_date=start, end_date=today) + if len(sched) > 0: + return sched.index[-1].date() except Exception: pass @@ -57,18 +59,6 @@ def get_last_nyse_trading_day(as_of: datetime = None) -> datetime.date: return candidate -def is_nyse_trading_day(date) -> bool: - """Return True if date is a NYSE trading day.""" - if NYSE_CAL_AVAILABLE: - try: - nyse = mcal.get_calendar("NYSE") - schedule = nyse.schedule(start_date=date, end_date=date) - return len(schedule) > 0 - except Exception: - pass - return date.weekday() < 5 - - # ββ Data loading ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ @st.cache_data(ttl=3600, show_spinner=False) @@ -88,10 +78,10 @@ def load_dataset(hf_token: str) -> pd.DataFrame: # Ensure DatetimeIndex if not isinstance(df.index, pd.DatetimeIndex): - if "Date" in df.columns: - df = df.set_index("Date") - elif "date" in df.columns: - df = df.set_index("date") + for col in ["Date", "date", "DATE"]: + if col in df.columns: + df = df.set_index(col) + break df.index = pd.to_datetime(df.index) df = df.sort_index() @@ -107,14 +97,6 @@ def load_dataset(hf_token: str) -> pd.DataFrame: def check_data_freshness(df: pd.DataFrame) -> dict: """ Check whether the dataset contains data for the last NYSE trading day. 
- - Returns a dict: - { - "fresh": bool, - "last_date_in_data": date, - "expected_date": date, - "message": str - } """ if df.empty: return { @@ -126,8 +108,7 @@ def check_data_freshness(df: pd.DataFrame) -> dict: last_date_in_data = df.index[-1].date() expected_date = get_last_nyse_trading_day() - - fresh = last_date_in_data >= expected_date + fresh = last_date_in_data >= expected_date if fresh: message = f"β Dataset is up to date through **{last_date_in_data}**." @@ -150,66 +131,105 @@ def check_data_freshness(df: pd.DataFrame) -> dict: def get_features_and_targets(df: pd.DataFrame): """ - Extract input feature columns and target ETF return columns from the dataset. + Extract input feature columns and target ETF return columns. + + The dataset stores raw price or return values directly under ticker names. + We compute daily log returns for target ETFs if they are not already returns. Returns: - input_features : list of column names - target_etfs : list of ETF return column names (e.g. TLT_Ret) - tbill_rate : latest 3m T-bill rate as a float (annualised, e.g. 0.045) + input_features : list of column names to use as model inputs + target_etfs : list of ETF column names (after return computation) + tbill_rate : latest 3m T-bill rate as float (annualised, e.g. 0.045) + df : DataFrame (possibly with new _Ret columns added) """ - # Target ETF return columns - target_etfs = [c for c in REQUIRED_ETF_COLS if c in df.columns] - if not target_etfs: + # ββ Confirm target ETFs exist βββββββββββββββββββββββββββββββββββββββββββββ + missing = [c for c in TARGET_ETF_COLS if c not in df.columns] + if missing: raise ValueError( - f"No target ETF columns found. Expected: {REQUIRED_ETF_COLS}. " + f"Missing ETF columns: {missing}. 
" f"Found in dataset: {list(df.columns)}" ) - # Input features: Z-scores, vol, regime, yield curve, credit, rates, VIX terms - exclude = set(target_etfs + BENCHMARK_COLS + [TBILL_COL]) + # ββ Build return columns ββββββββββββββββββββββββββββββββββββββββββββββββββ + # If values look like prices (>5), compute pct returns. + # If they already look like small returns (<1 in abs), use as-is. + target_etfs = [] + for col in TARGET_ETF_COLS: + ret_col = f"{col}_Ret" + if ret_col not in df.columns: + sample = df[col].dropna() + if len(sample) > 0 and abs(sample.median()) > 1: + # Looks like price β compute pct change + df[ret_col] = df[col].pct_change() + else: + # Already returns + df[ret_col] = df[col] + target_etfs.append(ret_col) + + # Same for benchmarks + for col in BENCHMARK_COLS: + ret_col = f"{col}_Ret" + if ret_col not in df.columns and col in df.columns: + sample = df[col].dropna() + if len(sample) > 0 and abs(sample.median()) > 1: + df[ret_col] = df[col].pct_change() + else: + df[ret_col] = df[col] + + # Drop rows with NaN in target columns (first row after pct_change) + df = df.dropna(subset=target_etfs) + + # ββ Input features ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + # Use macro columns directly; exclude ETF price/return cols and benchmarks + exclude = set( + TARGET_ETF_COLS + BENCHMARK_COLS + + target_etfs + + [f"{c}_Ret" for c in BENCHMARK_COLS] + + [TBILL_COL] + ) + input_features = [ c for c in df.columns if c not in exclude - and ( - c.endswith("_Z") - or c.endswith("_Vol") - or "Regime" in c - or "YC_" in c - or "Credit_" in c - or "Rates_" in c - or "VIX_" in c - or "Spread" in c - or "DXY" in c - or "VIX" in c - or "T10Y" in c - ) + and c in (MACRO_COLS + [ + col for col in df.columns + if any(k in col for k in ["_Z", "_Vol", "Regime", "YC_", "Credit_", + "Rates_", "VIX_", "Spread", "DXY", "T10Y"]) + ]) ] - # 3m T-bill rate (for CASH return & Sharpe) - tbill_rate = 0.045 # default fallback + # Fallback: if none matched, use all 
non-excluded numeric columns + if not input_features: + input_features = [ + c for c in df.columns + if c not in exclude + and pd.api.types.is_numeric_dtype(df[c]) + ] + + # ββ T-bill rate βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + tbill_rate = 0.045 # default if TBILL_COL in df.columns: raw = df[TBILL_COL].dropna() if len(raw) > 0: - last_val = raw.iloc[-1] - # DTB3 is typically in percent (e.g. 5.25 means 5.25%) - tbill_rate = float(last_val) / 100 if last_val > 1 else float(last_val) + last_val = float(raw.iloc[-1]) + tbill_rate = last_val / 100 if last_val > 1 else last_val - return input_features, target_etfs, tbill_rate + return input_features, target_etfs, tbill_rate, df -# ββ Column info helper (for sidebar display) ββββββββββββββββββββββββββββββββββ +# ββ Dataset summary βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ def dataset_summary(df: pd.DataFrame) -> dict: - """Return a brief summary dict for sidebar display.""" if df.empty: return {} return { - "rows": len(df), - "columns": len(df.columns), - "start_date": df.index[0].strftime("%Y-%m-%d"), - "end_date": df.index[-1].strftime("%Y-%m-%d"), - "etfs_found": [c for c in REQUIRED_ETF_COLS if c in df.columns], - "benchmarks": [c for c in BENCHMARK_COLS if c in df.columns], + "rows": len(df), + "columns": len(df.columns), + "start_date": df.index[0].strftime("%Y-%m-%d"), + "end_date": df.index[-1].strftime("%Y-%m-%d"), + "etfs_found": [c for c in TARGET_ETF_COLS if c in df.columns], + "benchmarks": [c for c in BENCHMARK_COLS if c in df.columns], + "macro_found": [c for c in MACRO_COLS if c in df.columns], "tbill_found": TBILL_COL in df.columns, } diff --git 
a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/data/__init__.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/data/__init__.py @@ -0,0 +1 @@ + diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md 
b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md index 76b8435744e41334f32c6f1ad441d3b27fd29b2c..abfc7cb8aefe0be318b035c033e2ef83e1f2e0bf 100644 --- a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md @@ -1,3 +1,15 @@ +--- +title: P2-ETF-CNN-LSTM-ALTERNATIVE-APPROACHES +emoji: π§ +colorFrom: green +colorTo: blue +sdk: streamlit +sdk_version: "1.32.0" +python_version: "3.10" +app_file: app.py +pinned: false +--- + # P2-ETF-CNN-LSTM-ALTERNATIVE-APPROACHES Macro-driven ETF rotation using three augmented CNN-LSTM variants. 
diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/__init__.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/__init__.py @@ -0,0 +1 @@ + diff --git 
a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..b96ee711d680e273094523bcd220f93b06eb486d --- /dev/null +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/data/loader.py @@ -0,0 +1,215 @@ +""" +data/loader.py +Loads master_data.parquet from HF Dataset. 
+Validates freshness against the last NYSE trading day. +No external pings β all data comes from HF Dataset only. +""" + +import pandas as pd +import numpy as np +import streamlit as st +from huggingface_hub import hf_hub_download +from datetime import datetime, timedelta +import pytz +import os + +try: + import pandas_market_calendars as mcal + NYSE_CAL_AVAILABLE = True +except ImportError: + NYSE_CAL_AVAILABLE = False + +DATASET_REPO = "P2SAMAPA/fi-etf-macro-signal-master-data" +PARQUET_FILE = "master_data.parquet" + +# Columns expected in the dataset +REQUIRED_ETF_COLS = ["TLT_Ret", "TBT_Ret", "VNQ_Ret", "SLV_Ret", "GLD_Ret"] +BENCHMARK_COLS = ["SPY_Ret", "AGG_Ret"] +TBILL_COL = "DTB3" # 3m T-bill column in HF dataset +TARGET_ETFS = REQUIRED_ETF_COLS # 5 targets (no CASH in returns, CASH handled in strategy) + + +# ββ NYSE calendar helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def get_last_nyse_trading_day(as_of: datetime = None) -> datetime.date: + """Return the most recent NYSE trading day before or on as_of (default: today EST).""" + est = pytz.timezone("US/Eastern") + if as_of is None: + as_of = datetime.now(est) + + today = as_of.date() + + if NYSE_CAL_AVAILABLE: + try: + nyse = mcal.get_calendar("NYSE") + # Look back up to 10 days to find last trading day + start = today - timedelta(days=10) + schedule = nyse.schedule(start_date=start, end_date=today) + if len(schedule) > 0: + return schedule.index[-1].date() + except Exception: + pass + + # Fallback: skip weekends + candidate = today + while candidate.weekday() >= 5: + candidate -= timedelta(days=1) + return candidate + + +def is_nyse_trading_day(date) -> bool: + """Return True if date is a NYSE trading day.""" + if NYSE_CAL_AVAILABLE: + try: + nyse = mcal.get_calendar("NYSE") + schedule = nyse.schedule(start_date=date, end_date=date) + return len(schedule) > 0 + except Exception: + pass + return date.weekday() < 5 + + +# ββ Data loading 
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +@st.cache_data(ttl=3600, show_spinner=False) +def load_dataset(hf_token: str) -> pd.DataFrame: + """ + Download master_data.parquet from HF Dataset and return as DataFrame. + Cached for 1 hour. Index is parsed as DatetimeIndex. + """ + try: + path = hf_hub_download( + repo_id=DATASET_REPO, + filename=PARQUET_FILE, + repo_type="dataset", + token=hf_token, + ) + df = pd.read_parquet(path) + + # Ensure DatetimeIndex + if not isinstance(df.index, pd.DatetimeIndex): + if "Date" in df.columns: + df = df.set_index("Date") + elif "date" in df.columns: + df = df.set_index("date") + df.index = pd.to_datetime(df.index) + + df = df.sort_index() + return df + + except Exception as e: + st.error(f"β Failed to load dataset from HuggingFace: {e}") + return pd.DataFrame() + + +# ββ Freshness check βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def check_data_freshness(df: pd.DataFrame) -> dict: + """ + Check whether the dataset contains data for the last NYSE trading day. + + Returns a dict: + { + "fresh": bool, + "last_date_in_data": date, + "expected_date": date, + "message": str + } + """ + if df.empty: + return { + "fresh": False, + "last_date_in_data": None, + "expected_date": None, + "message": "Dataset is empty.", + } + + last_date_in_data = df.index[-1].date() + expected_date = get_last_nyse_trading_day() + + fresh = last_date_in_data >= expected_date + + if fresh: + message = f"β Dataset is up to date through **{last_date_in_data}**." + else: + message = ( + f"β οΈ **{expected_date}** data not yet updated in dataset. " + f"Latest available: **{last_date_in_data}**. " + f"Please check back later β the dataset updates daily after market close." 
+ ) + + return { + "fresh": fresh, + "last_date_in_data": last_date_in_data, + "expected_date": expected_date, + "message": message, + } + + +# ββ Feature / target extraction βββββββββββββββββββββββββββββββββββββββββββββββ + +def get_features_and_targets(df: pd.DataFrame): + """ + Extract input feature columns and target ETF return columns from the dataset. + + Returns: + input_features : list of column names + target_etfs : list of ETF return column names (e.g. TLT_Ret) + tbill_rate : latest 3m T-bill rate as a float (annualised, e.g. 0.045) + """ + # Target ETF return columns + target_etfs = [c for c in REQUIRED_ETF_COLS if c in df.columns] + + if not target_etfs: + raise ValueError( + f"No target ETF columns found. Expected: {REQUIRED_ETF_COLS}. " + f"Found in dataset: {list(df.columns)}" + ) + + # Input features: Z-scores, vol, regime, yield curve, credit, rates, VIX terms + exclude = set(target_etfs + BENCHMARK_COLS + [TBILL_COL]) + input_features = [ + c for c in df.columns + if c not in exclude + and ( + c.endswith("_Z") + or c.endswith("_Vol") + or "Regime" in c + or "YC_" in c + or "Credit_" in c + or "Rates_" in c + or "VIX_" in c + or "Spread" in c + or "DXY" in c + or "VIX" in c + or "T10Y" in c + ) + ] + + # 3m T-bill rate (for CASH return & Sharpe) + tbill_rate = 0.045 # default fallback + if TBILL_COL in df.columns: + raw = df[TBILL_COL].dropna() + if len(raw) > 0: + last_val = raw.iloc[-1] + # DTB3 is typically in percent (e.g. 
5.25 means 5.25%) + tbill_rate = float(last_val) / 100 if last_val > 1 else float(last_val) + + return input_features, target_etfs, tbill_rate + + +# ββ Column info helper (for sidebar display) ββββββββββββββββββββββββββββββββββ + +def dataset_summary(df: pd.DataFrame) -> dict: + """Return a brief summary dict for sidebar display.""" + if df.empty: + return {} + return { + "rows": len(df), + "columns": len(df.columns), + "start_date": df.index[0].strftime("%Y-%m-%d"), + "end_date": df.index[-1].strftime("%Y-%m-%d"), + "etfs_found": [c for c in REQUIRED_ETF_COLS if c in df.columns], + "benchmarks": [c for c in BENCHMARK_COLS if c in df.columns], + "tbill_found": TBILL_COL in df.columns, + } diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md index db5c9c5e3605dc62b9c2835d44bba4f98c526886..76b8435744e41334f32c6f1ad441d3b27fd29b2c 100644 --- 
a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md @@ -1,19 +1,115 @@ +# P2-ETF-CNN-LSTM-ALTERNATIVE-APPROACHES + +Macro-driven ETF rotation using three augmented CNN-LSTM variants. +Winner selected by **highest raw annualised return** on the out-of-sample test set. 
+ +--- + +## Architecture Overview + +| Approach | Core Idea | Key Addition | +|---|---|---| +| **1 β Wavelet** | DWT decomposes each macro signal into frequency subbands before the CNN | Separates trend / cycle / noise | +| **2 β Regime-Conditioned** | HMM detects macro regimes; one-hot regime label concatenated into the network | Removes non-stationarity | +| **3 β Multi-Scale Parallel** | Three CNN towers (kernels 3, 7, 21 days) run in parallel before the LSTM | Captures momentum + cycle + trend simultaneously | + --- -title: P2 ETF CNN LSTM ALTERNATIVE APPROACHES -emoji: π -colorFrom: red -colorTo: red -sdk: docker -app_port: 8501 -tags: -- streamlit -pinned: false -short_description: Streamlit template space + +## ETF Universe + +| Ticker | Description | +|---|---| +| TLT | 20+ Year Treasury Bond | +| TBT | 20+ Year Treasury Short (2Γ) | +| VNQ | Real Estate (REIT) | +| SLV | Silver | +| GLD | Gold | +| CASH | 3m T-bill rate (from HF dataset) | + +Benchmarks (chart only, not traded): **SPY**, **AGG** + +--- + +## Data + +All data sourced exclusively from: +**`P2SAMAPA/fi-etf-macro-signal-master-data`** (HuggingFace Dataset) +File: `master_data.parquet` + +No external API calls (no yfinance, no FRED). +The app checks daily whether the prior NYSE trading day's data is present in the dataset. + --- -# Welcome to Streamlit! 
+## Project Structure + +``` +βββ .github/ +β βββ workflows/ +β βββ sync.yml # Auto-sync GitHub β HF Space on push to main +β +βββ app.py # Streamlit orchestrator (UI wiring only) +β +βββ data/ +β βββ loader.py # HF dataset load, freshness check, column validation +β +βββ models/ +β βββ base.py # Shared: sequences, splits, scaling, callbacks +β βββ approach1_wavelet.py # Wavelet CNN-LSTM +β βββ approach2_regime.py # Regime-Conditioned CNN-LSTM +β βββ approach3_multiscale.py # Multi-Scale Parallel CNN-LSTM +β +βββ strategy/ +β βββ backtest.py # execute_strategy, metrics, winner selection +β +βββ signals/ +β βββ conviction.py # Z-score conviction scoring +β +βββ ui/ +β βββ components.py # Banner, conviction panel, metrics, audit trail +β βββ charts.py # Plotly equity curve + comparison bar chart +β +βββ utils/ +β βββ calendar.py # NYSE calendar, next trading day, EST time +β +βββ requirements.txt +βββ README.md +``` + +--- + +## Secrets Required + +| Secret | Where | Purpose | +|---|---|---| +| `HF_TOKEN` | GitHub + HF Space | Read HF dataset Β· Sync HF Space | + +Set in: +- GitHub: `Settings β Secrets β Actions β New repository secret` +- HF Space: `Settings β Repository secrets` + +--- + +## Deployment + +Push to `main` β GitHub Actions (`sync.yml`) automatically syncs to HF Space. + +### Local development + +```bash +pip install -r requirements.txt +export HF_TOKEN=your_token +streamlit run app.py +``` + +--- -Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart: +## Output UI -If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community -forums](https://discuss.streamlit.io). +1. **Data freshness warning** β alerts if prior NYSE trading day data is missing +2. **Next Trading Day Signal** β date + ETF from the winning approach +3. **Signal Conviction** β Z-score gauge + per-ETF probability bars +4. **Performance Metrics** β Annualised Return, Sharpe, Hit Ratio, Max DD +5. 
**Approach Comparison Table** β all three approaches side by side +6. **Equity Curves** β all three approaches + SPY + AGG benchmarks +7. **Audit Trail** β last 20 trading days for the winning approach diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py new file mode 100644 index 0000000000000000000000000000000000000000..eded65d240364585407db8613bb5c2acdbe1d7d9 --- /dev/null +++ 
b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/app.py @@ -0,0 +1,273 @@ +""" +app.py +P2-ETF-CNN-LSTM-ALTERNATIVE-APPROACHES +Streamlit orchestrator β UI wiring only, no business logic here. +""" + +import os +import streamlit as st +import pandas as pd +import numpy as np + +# ββ Module imports ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +from data.loader import load_dataset, check_data_freshness, get_features_and_targets, dataset_summary +from utils.calendar import get_est_time, is_sync_window, get_next_signal_date +from models.base import build_sequences, train_val_test_split, scale_features, returns_to_labels +from models.approach1_wavelet import train_approach1, predict_approach1 +from models.approach2_regime import train_approach2, predict_approach2 +from models.approach3_multiscale import train_approach3, predict_approach3 +from strategy.backtest import execute_strategy, select_winner, build_comparison_table +from signals.conviction import compute_conviction +from ui.components import ( + show_freshness_status, show_signal_banner, show_conviction_panel, + show_metrics_row, show_comparison_table, show_audit_trail, +) +from ui.charts import equity_curve_chart, comparison_bar_chart + +# ββ Page config βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +st.set_page_config( + page_title="P2-ETF-CNN-LSTM", + page_icon="π§ ", + layout="wide", +) + +# ββ Secrets 
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +HF_TOKEN = os.getenv("HF_TOKEN", "") + +# ββ Sidebar βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +with st.sidebar: + st.header("βοΈ Configuration") + + now_est = get_est_time() + st.write(f"π **EST:** {now_est.strftime('%H:%M:%S')}") + if is_sync_window(): + st.success("β Sync Window Active") + else: + st.info("βΈοΈ Sync Window Inactive") + + st.divider() + + start_yr = st.slider("π Start Year", 2010, 2024, 2016) + fee_bps = st.slider("π° Fee (bps)", 0, 50, 10) + lookback = st.slider("π Lookback (days)", 20, 60, 30, step=5) + epochs = st.number_input("π Max Epochs", 20, 300, 100, step=10) + + st.divider() + + split_option = st.selectbox("π Train/Val/Test Split", ["70/15/15", "80/10/10"], index=0) + split_map = {"70/15/15": (0.70, 0.15), "80/10/10": (0.80, 0.10)} + train_pct, val_pct = split_map[split_option] + + include_cash = st.checkbox("π΅ Include CASH class", value=True, + help="Model can select CASH (earns T-bill rate) as an alternative to any ETF") + + st.divider() + + run_button = st.button("π Run All 3 Approaches", type="primary", use_container_width=True) + +# ββ Title βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +st.title("π§ P2-ETF-CNN-LSTM") +st.caption("Approach 1: Wavelet Β· Approach 2: Regime-Conditioned Β· Approach 3: Multi-Scale Parallel") +st.caption("Winner selected by highest raw annualised return on out-of-sample test set.") + +# ββ Load data (always, to check freshness) ββββββββββββββββββββββββββββββββββββ +if not HF_TOKEN: + st.error("β HF_TOKEN secret not found. 
Please add it to your HF Space / GitHub secrets.") + st.stop() + +with st.spinner("π‘ Loading dataset from HuggingFace..."): + df = load_dataset(HF_TOKEN) + +if df.empty: + st.stop() + +# ββ Freshness check βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +freshness = check_data_freshness(df) +show_freshness_status(freshness) + +# ββ Dataset summary in sidebar ββββββββββββββββββββββββββββββββββββββββββββββββ +with st.sidebar: + st.divider() + st.subheader("π¦ Dataset Info") + summary = dataset_summary(df) + if summary: + st.write(f"**Rows:** {summary['rows']:,}") + st.write(f"**Range:** {summary['start_date']} β {summary['end_date']}") + st.write(f"**ETFs:** {', '.join([e.replace('_Ret','') for e in summary['etfs_found']])}") + st.write(f"**Benchmarks:** {', '.join([b.replace('_Ret','') for b in summary['benchmarks']])}") + st.write(f"**T-bill col:** {'β ' if summary['tbill_found'] else 'β'}") + +# ββ Main execution ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +if not run_button: + st.info("π Configure parameters in the sidebar and click **π Run All 3 Approaches** to begin.") + st.stop() + +# ββ Filter by start year ββββββββββββββββββββββββββββββββββββββββββββββββββββββ +df = df[df.index.year >= start_yr].copy() +st.write(f"π **Data:** {df.index[0].strftime('%Y-%m-%d')} β {df.index[-1].strftime('%Y-%m-%d')} " + f"({df.index[-1].year - df.index[0].year + 1} years)") + +# ββ Feature / target extraction βββββββββββββββββββββββββββββββββββββββββββββββ +try: + input_features, target_etfs, tbill_rate = get_features_and_targets(df) +except ValueError as e: + st.error(str(e)) + st.stop() + +st.info(f"π― **Targets:** {len(target_etfs)} ETFs Β· **Features:** {len(input_features)} signals Β· " + f"**T-bill rate:** {tbill_rate*100:.2f}%") + +# ββ Prepare sequences βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +X_raw = df[input_features].values.astype(np.float32) +y_raw = df[target_etfs].values.astype(np.float32) +n_etfs = len(target_etfs) 
+n_classes = n_etfs + (1 if include_cash else 0) # +1 for CASH + +# Fill NaNs with column means +col_means = np.nanmean(X_raw, axis=0) +for j in range(X_raw.shape[1]): + mask = np.isnan(X_raw[:, j]) + X_raw[mask, j] = col_means[j] + +X_seq, y_seq = build_sequences(X_raw, y_raw, lookback) +y_labels = returns_to_labels(y_seq, include_cash=include_cash) + +X_train, y_train_r, X_val, y_val_r, X_test, y_test_r = train_val_test_split(X_seq, y_seq, train_pct, val_pct) +_, y_train_l, _, y_val_l, _, y_test_l = train_val_test_split(X_seq, y_labels, train_pct, val_pct) + +X_train_s, X_val_s, X_test_s, _ = scale_features(X_train, X_val, X_test) + +train_size = len(X_train) +val_size = len(X_val) + +# Test dates (aligned with y_test) +test_start = lookback + train_size + val_size +test_dates = df.index[test_start: test_start + len(X_test)] +test_slice = slice(test_start, test_start + len(X_test)) + +st.success(f"β Sequences β Train: {train_size} Β· Val: {val_size} Β· Test: {len(X_test)}") + +# ββ Train all three approaches ββββββββββββββββββββββββββββββββββββββββββββββββ +results = {} +trained_info = {} # store extra info needed for conviction + +progress = st.progress(0, text="Starting training...") + +# ββ Approach 1: Wavelet βββββββββββββββββββββββββββββββββββββββββββββββββββββββ +with st.spinner("π Training Approach 1 β Wavelet CNN-LSTM..."): + try: + model1, hist1, _ = train_approach1( + X_train_s, y_train_l, + X_val_s, y_val_l, + n_classes=n_classes, epochs=int(epochs), + ) + preds1, proba1 = predict_approach1(model1, X_test_s) + results["Approach 1"] = execute_strategy( + preds1, proba1, y_test_r, test_dates, target_etfs, fee_bps, tbill_rate, include_cash, + ) + trained_info["Approach 1"] = {"proba": proba1} + st.success("β Approach 1 complete") + except Exception as e: + st.warning(f"β οΈ Approach 1 failed: {e}") + results["Approach 1"] = None + +progress.progress(33, text="Approach 1 done...") + +# ββ Approach 2: Regime-Conditioned 
βββββββββββββββββββββββββββββββββββββββββββ +with st.spinner("π Training Approach 2 β Regime-Conditioned CNN-LSTM..."): + try: + model2, hist2, hmm2, regime_cols2 = train_approach2( + X_train_s, y_train_l, + X_val_s, y_val_l, + X_flat_all=X_raw, + feature_names=input_features, + lookback=lookback, + train_size=train_size, + val_size=val_size, + n_classes=n_classes, epochs=int(epochs), + ) + preds2, proba2 = predict_approach2( + model2, X_test_s, X_raw, regime_cols2, hmm2, + lookback, train_size, val_size, + ) + results["Approach 2"] = execute_strategy( + preds2, proba2, y_test_r, test_dates, target_etfs, fee_bps, tbill_rate, include_cash, + ) + trained_info["Approach 2"] = {"proba": proba2} + st.success("β Approach 2 complete") + except Exception as e: + st.warning(f"β οΈ Approach 2 failed: {e}") + results["Approach 2"] = None + +progress.progress(66, text="Approach 2 done...") + +# ββ Approach 3: Multi-Scale βββββββββββββββββββββββββββββββββββββββββββββββββββ +with st.spinner("π‘ Training Approach 3 β Multi-Scale CNN-LSTM..."): + try: + model3, hist3 = train_approach3( + X_train_s, y_train_l, + X_val_s, y_val_l, + n_classes=n_classes, epochs=int(epochs), + ) + preds3, proba3 = predict_approach3(model3, X_test_s) + results["Approach 3"] = execute_strategy( + preds3, proba3, y_test_r, test_dates, target_etfs, fee_bps, tbill_rate, include_cash, + ) + trained_info["Approach 3"] = {"proba": proba3} + st.success("β Approach 3 complete") + except Exception as e: + st.warning(f"β οΈ Approach 3 failed: {e}") + results["Approach 3"] = None + +progress.progress(100, text="All approaches complete!") +progress.empty() + +# ββ Select winner βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +winner_name = select_winner(results) +winner_res = results.get(winner_name) + +if winner_res is None: + st.error("β All approaches failed. 
Please check your data and configuration.") + st.stop() + +# ββ Next trading date βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +next_date = get_next_signal_date() + +st.divider() + +# ββ Signal banner (winner) ββββββββββββββββββββββββββββββββββββββββββββββββββββ +show_signal_banner(winner_res["next_signal"], next_date, winner_name) + +# ββ Conviction panel ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +winner_proba = trained_info[winner_name]["proba"] +conviction = compute_conviction(winner_proba[-1], target_etfs, include_cash) +show_conviction_panel(conviction) + +st.divider() + +# ββ Winner metrics ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +st.subheader(f"π {winner_name} β Performance Metrics") +show_metrics_row(winner_res, tbill_rate) + +st.divider() + +# ββ Comparison table ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +st.subheader("π Approach Comparison (Winner = Highest Raw Annualised Return)") +comparison_df = build_comparison_table(results, winner_name) +show_comparison_table(comparison_df) + +# ββ Comparison bar chart ββββββββββββββββββββββββββββββββββββββββββββββββββββββ +st.plotly_chart(comparison_bar_chart(results, winner_name), use_container_width=True) + +st.divider() + +# ββ Equity curves βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +st.subheader("π Out-of-Sample Equity Curves β All Approaches vs Benchmarks") +fig = equity_curve_chart(results, winner_name, test_dates, df, test_slice, tbill_rate) +st.plotly_chart(fig, use_container_width=True) + +st.divider() + +# ββ Audit trail (winner) ββββββββββββββββββββββββββββββββββββββββββββββββββββββ +st.subheader(f"π Audit Trail β {winner_name} (Last 20 Trading Days)") +show_audit_trail(winner_res["audit_trail"]) diff --git 
a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitattributes b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text 
+*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git 
a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..5f51ead59f36f13043e036290df9440e25fe8cbe --- /dev/null +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/Dockerfile @@ -0,0 +1,20 @@ +FROM python:3.13.5-slim + +WORKDIR /app + +RUN apt-get 
update && apt-get install -y \ + build-essential \ + curl \ + git \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt ./ +COPY src/ ./src/ + +RUN pip3 install -r requirements.txt + +EXPOSE 8501 + +HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health + +ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"] \ No newline at end of file diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db5c9c5e3605dc62b9c2835d44bba4f98c526886 --- /dev/null +++ 
b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/README.md @@ -0,0 +1,19 @@ +--- +title: P2 ETF CNN LSTM ALTERNATIVE APPROACHES +emoji: π +colorFrom: red +colorTo: red +sdk: docker +app_port: 8501 +tags: +- streamlit +pinned: false +short_description: Streamlit template space +--- + +# Welcome to Streamlit! + +Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart: + +If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community +forums](https://discuss.streamlit.io). 
diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/requirements.txt b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..28d994e22f8dd432b51df193562052e315ad95f7 --- /dev/null +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/requirements.txt @@ -0,0 +1,3 @@ +altair +pandas +streamlit \ No 
newline at end of file diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/src/streamlit_app.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/src/streamlit_app.py new file mode 100644 index 0000000000000000000000000000000000000000..99d0b84662681e7d21a08fcce44908344fa86f80 --- /dev/null +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/src/streamlit_app.py @@ -0,0 +1,40 
@@ +import altair as alt +import numpy as np +import pandas as pd +import streamlit as st + +""" +# Welcome to Streamlit! + +Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:. +If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community +forums](https://discuss.streamlit.io). + +In the meantime, below is an example of what you can do with just a few lines of code: +""" + +num_points = st.slider("Number of points in spiral", 1, 10000, 1100) +num_turns = st.slider("Number of turns in spiral", 1, 300, 31) + +indices = np.linspace(0, 1, num_points) +theta = 2 * np.pi * num_turns * indices +radius = indices + +x = radius * np.cos(theta) +y = radius * np.sin(theta) + +df = pd.DataFrame({ + "x": x, + "y": y, + "idx": indices, + "rand": np.random.randn(num_points), +}) + +st.altair_chart(alt.Chart(df, height=700, width=700) + .mark_point(filled=True) + .encode( + x=alt.X("x", axis=None), + y=alt.Y("y", axis=None), + color=alt.Color("idx", legend=None, scale=alt.Scale()), + size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])), + )) \ No newline at end of file diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/requirements.txt 
b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/requirements.txt index 28d994e22f8dd432b51df193562052e315ad95f7..0b1bc9a5b544b19aaa1f70c7ab427d1c5be3f9b2 100644 --- a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/requirements.txt +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/requirements.txt @@ -1,3 +1,29 @@ -altair -pandas -streamlit \ No newline at end of file +# Core +streamlit>=1.32.0 
+pandas>=2.0.0 +numpy>=1.24.0 + +# Hugging Face +huggingface_hub>=0.21.0 +datasets>=2.18.0 + +# Machine Learning +tensorflow>=2.14.0 +scikit-learn>=1.3.0 +xgboost>=2.0.0 + +# Wavelet (Approach 1) +PyWavelets>=1.5.0 + +# Regime detection (Approach 2) +hmmlearn>=0.3.0 + +# Visualisation +plotly>=5.18.0 + +# NYSE Calendar +pandas_market_calendars>=4.3.0 +pytz>=2024.1 + +# Parquet +pyarrow>=14.0.0 diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/models/base.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/models/base.py new file mode 100644 index 0000000000000000000000000000000000000000..51a86329f292dd5bc931ddb615fb3fc76bf4a3fa --- /dev/null +++ 
b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/models/base.py @@ -0,0 +1,199 @@ +""" +models/base.py +Shared utilities for all three CNN-LSTM variants: + - Data preparation (sequences, train/val/test split) + - Common Keras layers / callbacks + - Predict + evaluate helpers +""" + +import numpy as np +import pandas as pd +from sklearn.preprocessing import RobustScaler +import tensorflow as tf +from tensorflow import keras + +# ββ Reproducibility βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +SEED = 42 +tf.random.set_seed(SEED) +np.random.seed(SEED) + + +# ββ Sequence builder ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def build_sequences(features: np.ndarray, targets: np.ndarray, lookback: int): + """ + Build supervised sequences for CNN-LSTM input. 
+ + Args: + features : 2-D array [n_days, n_features] + targets : 2-D array [n_days, n_etfs] (raw returns) + lookback : number of past days per sample + + Returns: + X : [n_samples, lookback, n_features] + y : [n_samples, n_etfs] (raw returns for the next day) + """ + X, y = [], [] + for i in range(lookback, len(features)): + X.append(features[i - lookback: i]) + y.append(targets[i]) + return np.array(X, dtype=np.float32), np.array(y, dtype=np.float32) + + +# ββ Train / val / test split ββββββββββββββββββββββββββββββββββββββββββββββββββ + +def train_val_test_split(X, y, train_pct=0.70, val_pct=0.15): + """Split sequences into train / val / test preserving temporal order.""" + n = len(X) + t1 = int(n * train_pct) + t2 = int(n * (train_pct + val_pct)) + + return ( + X[:t1], y[:t1], + X[t1:t2], y[t1:t2], + X[t2:], y[t2:], + ) + + +# ββ Feature scaling βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def scale_features(X_train, X_val, X_test): + """ + Fit RobustScaler on training data only, apply to val and test. + Operates on the flattened feature dimension. + + Returns scaled arrays with same shape as inputs. + """ + n_train, lb, n_feat = X_train.shape + scaler = RobustScaler() + + # Fit on train + scaler.fit(X_train.reshape(-1, n_feat)) + + def _transform(X): + shape = X.shape + return scaler.transform(X.reshape(-1, n_feat)).reshape(shape) + + return _transform(X_train), _transform(X_val), _transform(X_test), scaler + + +# ββ Label builder (classification: argmax of returns) ββββββββββββββββββββββββ + +def returns_to_labels(y_raw, include_cash=True, cash_threshold=0.0): + """ + Convert raw return matrix to integer class labels. + + If include_cash=True, adds a CASH class (index = n_etfs) when + the best ETF return is below cash_threshold. 
+ + Args: + y_raw : [n_samples, n_etfs] + include_cash : whether to allow CASH class + cash_threshold : minimum ETF return to prefer over CASH + + Returns: + labels : [n_samples] integer class indices + """ + best = np.argmax(y_raw, axis=1) + if include_cash: + best_return = y_raw[np.arange(len(y_raw)), best] + cash_idx = y_raw.shape[1] + labels = np.where(best_return < cash_threshold, cash_idx, best) + else: + labels = best + return labels.astype(np.int32) + + +# ββ Common Keras callbacks ββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def get_callbacks(patience_es=15, patience_lr=8, min_lr=1e-6): + """Standard early stopping + reduce-LR callbacks shared by all models.""" + return [ + keras.callbacks.EarlyStopping( + monitor="val_loss", + patience=patience_es, + restore_best_weights=True, + verbose=0, + ), + keras.callbacks.ReduceLROnPlateau( + monitor="val_loss", + factor=0.5, + patience=patience_lr, + min_lr=min_lr, + verbose=0, + ), + ] + + +# ββ Common output head ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def classification_head(x, n_classes: int, dropout: float = 0.3): + """ + Shared dense output head for all three CNN-LSTM variants. 
+ + Args: + x : input tensor + n_classes : number of ETF classes (+ 1 for CASH if applicable) + dropout : dropout rate + + Returns: + output tensor with softmax activation + """ + x = keras.layers.Dense(64, activation="relu")(x) + x = keras.layers.Dropout(dropout)(x) + x = keras.layers.Dense(n_classes, activation="softmax")(x) + return x + + +# ββ Prediction helper βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def predict_classes(model, X_test: np.ndarray) -> np.ndarray: + """Return integer class predictions from a Keras model.""" + proba = model.predict(X_test, verbose=0) + return np.argmax(proba, axis=1), proba + + +# ββ Metrics helper ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def evaluate_returns( + preds: np.ndarray, + proba: np.ndarray, + y_raw_test: np.ndarray, + target_etfs: list, + tbill_rate: float, + fee_bps: int, + include_cash: bool = True, +): + """ + Given integer class predictions and raw return matrix, + compute strategy returns and summary metrics. 
+ + Returns: + strat_rets : np.ndarray of daily net returns + ann_return : annualised return (float) + cum_returns : cumulative return series + last_proba : probability vector for the last prediction + next_etf : name of ETF predicted for next session + """ + n_etfs = len(target_etfs) + strat_rets = [] + + for i, cls in enumerate(preds): + if include_cash and cls == n_etfs: + # CASH: earn daily T-bill rate + daily_tbill = tbill_rate / 252 + net = daily_tbill - (fee_bps / 10000) + else: + ret = y_raw_test[i][cls] + net = ret - (fee_bps / 10000) + strat_rets.append(net) + + strat_rets = np.array(strat_rets) + cum_returns = np.cumprod(1 + strat_rets) + ann_return = (cum_returns[-1] ** (252 / len(strat_rets))) - 1 + + last_proba = proba[-1] + next_cls = int(np.argmax(last_proba)) + next_etf = "CASH" if (include_cash and next_cls == n_etfs) else target_etfs[next_cls].replace("_Ret", "") + + return strat_rets, ann_return, cum_returns, last_proba, next_etf diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/models/approach1_wavelet.py 
"""
models/approach1_wavelet.py
Approach 1: Wavelet Decomposition CNN-LSTM

Pipeline:
    Raw macro signals
      → DWT (db4, level=3) per signal → multi-band channel stack
      → 1D CNN (64 filters, k=3) → MaxPool → (32 filters, k=3)
      → LSTM (128 units)
      → Dense 64 → Softmax (n_etfs + 1 CASH)
"""

import numpy as np
import pywt
import tensorflow as tf
from tensorflow import keras
from models.base import classification_head, get_callbacks

WAVELET = "db4"
LEVEL = 3


# ── Wavelet feature engineering ───────────────────────────────────────────────

def _wavelet_decompose_signal(signal: np.ndarray, wavelet: str, level: int) -> np.ndarray:
    """
    Decompose a single 1-D signal into DWT subbands and return them stacked.

    For a signal of length T:
        coeffs = [cA_n, cD_n, cD_{n-1}, ..., cD_1]
    Each subband is linearly interpolated back to length T so they stack.

    Kept as the single-signal reference implementation;
    apply_wavelet_transform() below is the vectorised batch path.

    Returns: array of shape [T, level+1]
    """
    T = len(signal)
    coeffs = pywt.wavedec(signal, wavelet, level=level)
    bands = []
    for c in coeffs:
        # Interpolate back to original length
        band = np.interp(
            np.linspace(0, len(c) - 1, T),
            np.arange(len(c)),
            c,
        )
        bands.append(band)
    return np.stack(bands, axis=-1)  # [T, level+1]


def apply_wavelet_transform(X: np.ndarray, wavelet: str = WAVELET, level: int = LEVEL) -> np.ndarray:
    """
    Apply DWT to every feature channel across all samples.

    Vectorised: the previous implementation looped over every (sample, feature)
    pair in Python. pywt.wavedec accepts an axis argument, so we decompose all
    samples of one feature at once, and because the resampling grid is the same
    for every sample, the linear interpolation back to `lookback` is a single
    gather + lerp. Numerically identical to the per-signal reference above
    (unit xp spacing makes np.interp's formula exactly c0 + w*(c1-c0)).

    Args:
        X : [n_samples, lookback, n_features]

    Returns:
        X_wt : [n_samples, lookback, n_features * (level+1)]
    """
    n_samples, lookback, n_features = X.shape
    n_bands = level + 1
    X_wt = np.zeros((n_samples, lookback, n_features * n_bands), dtype=np.float32)

    for f in range(n_features):
        # Decompose all samples of this feature in one C-level call.
        coeffs = pywt.wavedec(X[:, :, f], wavelet, level=level, axis=1)
        for b, c in enumerate(coeffs):  # c: [n_samples, Lc]
            Lc = c.shape[1]
            xnew = np.linspace(0.0, Lc - 1, lookback)
            i0 = np.floor(xnew).astype(np.intp)
            i1 = np.minimum(i0 + 1, Lc - 1)
            w = xnew - i0
            # Linear interpolation, matching np.interp with unit spacing.
            X_wt[:, :, f * n_bands + b] = c[:, i0] + w * (c[:, i1] - c[:, i0])

    return X_wt


# ── Model builder ─────────────────────────────────────────────────────────────

def build_wavelet_cnn_lstm(
    input_shape: tuple,
    n_classes: int,
    dropout: float = 0.3,
    lstm_units: int = 128,
) -> keras.Model:
    """
    Build the Wavelet CNN-LSTM model.

    Args:
        input_shape : (lookback, n_features * n_bands) — post-DWT shape
        n_classes   : number of output classes (ETFs + CASH)
        dropout     : dropout rate
        lstm_units  : LSTM hidden size

    Returns:
        Compiled Keras model
    """
    inputs = keras.Input(shape=input_shape, name="wavelet_input")

    # CNN block 1
    x = keras.layers.Conv1D(64, kernel_size=3, padding="causal", activation="relu")(inputs)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.MaxPooling1D(pool_size=2)(x)

    # CNN block 2
    x = keras.layers.Conv1D(32, kernel_size=3, padding="causal", activation="relu")(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Dropout(dropout)(x)

    # LSTM
    x = keras.layers.LSTM(lstm_units, dropout=dropout, recurrent_dropout=0.1)(x)

    # Output head
    outputs = classification_head(x, n_classes, dropout)

    model = keras.Model(inputs, outputs, name="Approach1_Wavelet_CNN_LSTM")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model


# ── Full train pipeline ───────────────────────────────────────────────────────

def train_approach1(
    X_train, y_train,
    X_val, y_val,
    n_classes: int,
    epochs: int = 100,
    batch_size: int = 32,
    dropout: float = 0.3,
    lstm_units: int = 128,
):
    """
    Apply the wavelet transform then train the CNN-LSTM.

    Args:
        X_train/val : [n, lookback, n_features] (scaled, pre-wavelet)
        y_train/val : [n] integer class labels
        n_classes   : total output classes

    Returns:
        model    : trained Keras model
        history  : training history
        wt_shape : post-DWT input shape (for inference)
    """
    # Apply DWT
    X_train_wt = apply_wavelet_transform(X_train)
    X_val_wt = apply_wavelet_transform(X_val)

    input_shape = X_train_wt.shape[1:]  # (lookback, n_features * n_bands)
    model = build_wavelet_cnn_lstm(input_shape, n_classes, dropout, lstm_units)

    history = model.fit(
        X_train_wt, y_train,
        validation_data=(X_val_wt, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=get_callbacks(),
        verbose=0,
    )

    return model, history, input_shape


def predict_approach1(model, X_test: np.ndarray) -> tuple:
    """Apply DWT to the test set then predict. Returns (class_preds, proba)."""
    X_test_wt = apply_wavelet_transform(X_test)
    proba = model.predict(X_test_wt, verbose=0)
    preds = np.argmax(proba, axis=1)
    return preds, proba
b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/models/approach2_regime.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/models/approach2_regime.py @@ -0,0 +1 @@ + diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/models/approach3_multiscale.py 
b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/models/approach3_multiscale.py new file mode 100644 index 0000000000000000000000000000000000000000..1b4e0821dacfec060ff2276feece20a75fc856cf --- /dev/null +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/models/approach3_multiscale.py @@ -0,0 +1,150 @@ +""" +models/approach3_multiscale.py +Approach 3: Multi-Scale Parallel CNN-LSTM + +Pipeline: + Raw macro signals + β 3 parallel CNN towers: kernel 3 (short), 7 (medium), 21 (long) + β Concatenate [96 features] + β LSTM (128 units) + β Dense 64 β Softmax (n_etfs + 1 CASH) +""" + +import numpy as np +import tensorflow as tf +from tensorflow import keras +from models.base import classification_head, get_callbacks + +# Kernel sizes represent: momentum (3d), weekly cycle (7d), monthly trend (21d) +KERNEL_SIZES = [3, 7, 21] +FILTERS_EACH = 32 # 32 Γ 3 towers = 96 concatenated features + + +# ββ Model builder βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def build_multiscale_cnn_lstm( + input_shape: 
tuple, + n_classes: int, + kernel_sizes: list = None, + filters: int = FILTERS_EACH, + dropout: float = 0.3, + lstm_units: int = 128, +) -> keras.Model: + """ + Multi-scale parallel CNN-LSTM. + + Three CNN towers with different kernel sizes run in parallel on the + same input, capturing momentum, weekly cycle, and monthly trend + simultaneously. Their outputs are concatenated before the LSTM. + + Args: + input_shape : (lookback, n_features) + n_classes : number of output classes (ETFs + CASH) + kernel_sizes : list of kernel sizes for each tower + filters : number of Conv1D filters per tower + dropout : dropout rate + lstm_units : LSTM hidden size + + Returns: + Compiled Keras model + """ + if kernel_sizes is None: + kernel_sizes = KERNEL_SIZES + + inputs = keras.Input(shape=input_shape, name="multiscale_input") + + towers = [] + for k in kernel_sizes: + # Each tower: Conv β BN β Conv β BN β GlobalAvgPool + t = keras.layers.Conv1D( + filters, kernel_size=k, padding="causal", activation="relu", + name=f"conv1_k{k}" + )(inputs) + t = keras.layers.BatchNormalization(name=f"bn1_k{k}")(t) + t = keras.layers.Conv1D( + filters, kernel_size=k, padding="causal", activation="relu", + name=f"conv2_k{k}" + )(t) + t = keras.layers.BatchNormalization(name=f"bn2_k{k}")(t) + t = keras.layers.Dropout(dropout, name=f"drop_k{k}")(t) + towers.append(t) + + # Concatenate along the feature dimension β keeps temporal axis intact for LSTM + if len(towers) > 1: + merged = keras.layers.Concatenate(axis=-1, name="tower_concat")(towers) + else: + merged = towers[0] + + # LSTM integrates multi-scale temporal features + x = keras.layers.LSTM(lstm_units, dropout=dropout, recurrent_dropout=0.1, name="lstm")(merged) + + # Output head + outputs = classification_head(x, n_classes, dropout) + + model = keras.Model(inputs, outputs, name="Approach3_MultiScale_CNN_LSTM") + model.compile( + optimizer=keras.optimizers.Adam(learning_rate=1e-3), + loss="sparse_categorical_crossentropy", + 
metrics=["accuracy"], + ) + return model + + +# ββ Full train pipeline βββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def train_approach3( + X_train, y_train, + X_val, y_val, + n_classes: int, + epochs: int = 100, + batch_size: int = 32, + dropout: float = 0.3, + lstm_units: int = 128, + kernel_sizes: list = None, +): + """ + Build and train the multi-scale CNN-LSTM. + + Args: + X_train/val : [n, lookback, n_features] + y_train/val : [n] integer class labels + n_classes : total output classes + + Returns: + model : trained Keras model + history : training history + """ + if kernel_sizes is None: + kernel_sizes = KERNEL_SIZES + + # Guard: lookback must be >= largest kernel + lookback = X_train.shape[1] + valid_kernels = [k for k in kernel_sizes if k <= lookback] + if not valid_kernels: + valid_kernels = [min(3, lookback)] + + model = build_multiscale_cnn_lstm( + input_shape=X_train.shape[1:], + n_classes=n_classes, + kernel_sizes=valid_kernels, + dropout=dropout, + lstm_units=lstm_units, + ) + + history = model.fit( + X_train, y_train, + validation_data=(X_val, y_val), + epochs=epochs, + batch_size=batch_size, + callbacks=get_callbacks(), + verbose=0, + ) + + return model, history + + +def predict_approach3(model, X_test: np.ndarray) -> tuple: + """Predict on test set. 
Returns (class_preds, proba).""" + proba = model.predict(X_test, verbose=0) + preds = np.argmax(proba, axis=1) + return preds, proba diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/strategy/backtest.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/strategy/backtest.py new file mode 100644 index 0000000000000000000000000000000000000000..0accf7a328637edb158806e7ce682fc66080cc50 --- /dev/null +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/strategy/backtest.py @@ -0,0 +1,193 @@ +""" +strategy/backtest.py +Strategy execution, performance metrics, and 
benchmark calculations. +Supports CASH as a class (earns T-bill rate when selected). +""" + +import numpy as np +import pandas as pd +from datetime import datetime + + +# ββ Strategy execution ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def execute_strategy( + preds: np.ndarray, + proba: np.ndarray, + y_raw_test: np.ndarray, + test_dates: pd.DatetimeIndex, + target_etfs: list, + fee_bps: int, + tbill_rate: float, + include_cash: bool = True, +) -> dict: + """ + Execute strategy from model predictions. + + Args: + preds : [n] integer class predictions + proba : [n, n_classes] softmax probabilities + y_raw_test : [n, n_etfs] actual next-day ETF returns + test_dates : DatetimeIndex aligned with y_raw_test + target_etfs : list of ETF return column names e.g. ["TLT_Ret", ...] + fee_bps : transaction fee in basis points + tbill_rate : annualised 3m T-bill rate (e.g. 0.045) + include_cash: whether CASH is a valid class (index = n_etfs) + + Returns: + dict with keys: + strat_rets, cum_returns, ann_return, sharpe, + hit_ratio, max_dd, max_daily_dd, cum_max, + audit_trail, next_signal, next_proba + """ + n_etfs = len(target_etfs) + daily_tbill = tbill_rate / 252 + today = datetime.now().date() + + strat_rets = [] + audit_trail = [] + + for i, cls in enumerate(preds): + if include_cash and cls == n_etfs: + signal_etf = "CASH" + realized_ret = daily_tbill + else: + cls = min(cls, n_etfs - 1) + signal_etf = target_etfs[cls].replace("_Ret", "") + realized_ret = float(y_raw_test[i][cls]) + + net_ret = realized_ret - (fee_bps / 10000) + strat_rets.append(net_ret) + + trade_date = test_dates[i] + if trade_date.date() < today: + audit_trail.append({ + "Date": trade_date.strftime("%Y-%m-%d"), + "Signal": signal_etf, + "Realized": realized_ret, + "Net_Return": net_ret, + }) + + strat_rets = np.array(strat_rets, dtype=np.float64) + + # Next signal (last prediction) + last_cls = int(preds[-1]) + next_proba = proba[-1] + + if include_cash and last_cls == n_etfs: + 
next_signal = "CASH" + else: + last_cls = min(last_cls, n_etfs - 1) + next_signal = target_etfs[last_cls].replace("_Ret", "") + + metrics = _compute_metrics(strat_rets, tbill_rate) + + return { + **metrics, + "strat_rets": strat_rets, + "audit_trail": audit_trail, + "next_signal": next_signal, + "next_proba": next_proba, + } + + +# ββ Performance metrics βββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def _compute_metrics(strat_rets: np.ndarray, tbill_rate: float) -> dict: + if len(strat_rets) == 0: + return {} + + cum_returns = np.cumprod(1 + strat_rets) + n = len(strat_rets) + ann_return = float(cum_returns[-1] ** (252 / n) - 1) + + excess = strat_rets - tbill_rate / 252 + sharpe = float(np.mean(excess) / (np.std(strat_rets) + 1e-9) * np.sqrt(252)) + + recent = strat_rets[-15:] + hit_ratio = float(np.mean(recent > 0)) + + cum_max = np.maximum.accumulate(cum_returns) + drawdown = (cum_returns - cum_max) / cum_max + max_dd = float(np.min(drawdown)) + max_daily = float(np.min(strat_rets)) + + return { + "cum_returns": cum_returns, + "ann_return": ann_return, + "sharpe": sharpe, + "hit_ratio": hit_ratio, + "max_dd": max_dd, + "max_daily_dd":max_daily, + "cum_max": cum_max, + } + + +def compute_benchmark_metrics(returns: np.ndarray, tbill_rate: float) -> dict: + """Compute metrics for a benchmark return series.""" + return _compute_metrics(returns, tbill_rate) + + +# ββ Winner selection ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def select_winner(results: dict) -> str: + """ + Given a dict of {approach_name: result_dict}, return the approach name + with the highest annualised return (raw, not risk-adjusted). 
+ + Args: + results : {"Approach 1": {...}, "Approach 2": {...}, "Approach 3": {...}} + + Returns: + winner_name : str + """ + best_name = None + best_return = -np.inf + + for name, res in results.items(): + if res is None: + continue + ret = res.get("ann_return", -np.inf) + if ret > best_return: + best_return = ret + best_name = name + + return best_name + + +# ββ Comparison table ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def build_comparison_table(results: dict, winner_name: str) -> pd.DataFrame: + """ + Build a summary DataFrame comparing all three approaches. + + Args: + results : {name: result_dict} + winner_name : name of the winner + + Returns: + pd.DataFrame with one row per approach + """ + rows = [] + for name, res in results.items(): + if res is None: + rows.append({ + "Approach": name, + "Ann. Return": "N/A", + "Sharpe": "N/A", + "Hit Ratio (15d)":"N/A", + "Max Drawdown": "N/A", + "Winner": "", + }) + continue + + rows.append({ + "Approach": name, + "Ann. 
Return": f"{res['ann_return']*100:.2f}%", + "Sharpe": f"{res['sharpe']:.2f}", + "Hit Ratio (15d)": f"{res['hit_ratio']*100:.0f}%", + "Max Drawdown": f"{res['max_dd']*100:.2f}%", + "Winner": "β WINNER" if name == winner_name else "", + }) + + return pd.DataFrame(rows) diff --git a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/signals/conviction.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/signals/conviction.py new file mode 100644 index 0000000000000000000000000000000000000000..1d0365c502e88cd0fe5f487ac2c1f902d3b1a813 --- /dev/null +++ 
"""
signals/conviction.py
Signal conviction scoring via Z-score of model probabilities.
"""

import numpy as np


# Ordered high → low: compute_conviction() iterates this dict and takes the
# FIRST threshold the z-score clears, so insertion order is load-bearing.
CONVICTION_THRESHOLDS = {
    "Very High": 2.0,
    "High": 1.0,
    "Moderate": 0.0,
    # Below 0.0 → "Low"
}


def compute_conviction(proba: np.ndarray, target_etfs: list, include_cash: bool = True) -> dict:
    """
    Compute Z-score conviction for the selected signal.

    The z-score measures how far the winning class's probability sits above
    the mean of all class probabilities, in units of their std deviation.

    Args:
        proba       : 1-D softmax probability vector [n_classes]
        target_etfs : list of ETF return column names (e.g. ["TLT_Ret", ...])
        include_cash: whether CASH is the last class

    Returns:
        dict with keys:
            best_idx     : int
            best_name    : str (ETF ticker or "CASH")
            z_score      : float
            label        : str ("Very High" / "High" / "Moderate" / "Low")
            scores       : np.ndarray (raw proba)
            etf_names    : list of display names
            sorted_pairs : list of (name, score) sorted high→low
    """
    scores = np.array(proba, dtype=float)
    best_idx = int(np.argmax(scores))
    n_etfs = len(target_etfs)

    # Display names: strip the "_Ret" suffix; CASH (if any) is always last.
    etf_names = [e.replace("_Ret", "") for e in target_etfs]
    if include_cash:
        etf_names = etf_names + ["CASH"]

    best_name = etf_names[best_idx] if best_idx < len(etf_names) else "CASH"

    # Z-score of the winning probability; guard against a degenerate
    # (near-uniform) distribution where std is ~0.
    mean = np.mean(scores)
    std = np.std(scores)
    z = float((scores[best_idx] - mean) / std) if std > 1e-9 else 0.0

    # First threshold cleared wins (dict is ordered high → low).
    label = "Low"
    for lbl, threshold in CONVICTION_THRESHOLDS.items():
        if z >= threshold:
            label = lbl
            break

    # Sorted pairs for the UI bar chart
    sorted_pairs = sorted(
        zip(etf_names, scores),
        key=lambda x: x[1],
        reverse=True,
    )

    return {
        "best_idx": best_idx,
        "best_name": best_name,
        "z_score": z,
        "label": label,
        "scores": scores,
        "etf_names": etf_names,
        "sorted_pairs": sorted_pairs,
    }


def conviction_color(label: str) -> str:
    """Return the hex accent colour for a conviction label (grey fallback)."""
    return {
        "Very High": "#00b894",
        "High": "#00cec9",
        "Moderate": "#fdcb6e",
        "Low": "#d63031",
    }.get(label, "#888888")


def conviction_icon(label: str) -> str:
    """Return the status emoji for a conviction label (white-circle fallback).

    FIX: the previous glyphs were mojibake (encoding-corrupted emoji);
    restored to the intended traffic-light circles.
    """
    return {
        "Very High": "🟢",
        "High": "🟢",
        "Moderate": "🟡",
        "Low": "🔴",
    }.get(label, "⚪")
a/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/ui/components.py b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/ui/components.py new file mode 100644 index 0000000000000000000000000000000000000000..059d2da8ec8c5a2be4040d1c1702d28f0029361e --- /dev/null +++ b/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/hf_space/ui/components.py @@ -0,0 +1,229 @@ +""" +ui/components.py +Reusable Streamlit UI blocks: + - Freshness warning banner + - Next trading day signal banner + - Signal conviction panel + - Metrics row + - Audit trail table + - Comparison summary table +""" + +import streamlit as st +import pandas as pd +import numpy as np 
+ +from signals.conviction import conviction_color, conviction_icon + + +# ββ Freshness warning βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + +def show_freshness_status(freshness: dict): + """Display data freshness status. Stops app if data is stale.""" + if freshness.get("fresh"): + st.success(freshness["message"]) + else: + st.warning(freshness["message"]) + + +# ββ Next trading day banner βββββββββββββββββββββββββββββββββββββββββββββββββββ + +def show_signal_banner(next_signal: str, next_date, approach_name: str): + """Large coloured banner showing the winning approach's next signal.""" + is_cash = next_signal == "CASH" + bg = "linear-gradient(135deg, #2d3436 0%, #1a1a2e 100%)" if is_cash else \ + "linear-gradient(135deg, #00d1b2 0%, #00a896 100%)" + + st.markdown(f""" +