# space/tools/ts_preprocess.py import pandas as pd import numpy as np from typing import List MONTH = "MS" # month-start frequency def _emi(principal: float, annual_rate: float, n_months: int) -> float: """ EMI formula with monthly compounding. r_m = annual_rate / 12 EMI = P * r_m * (1+r_m)^n / ((1+r_m)^n - 1) """ if n_months <= 0 or principal <= 0: return 0.0 r = annual_rate / 12.0 if r <= 0: return principal / n_months fac = (1.0 + r) ** n_months return principal * r * fac / (fac - 1.0) def _project_deposit(principal: float, annual_rate: float, months: int) -> pd.DataFrame: """ Monthly path for a deposit. Value compounds monthly. """ r = annual_rate / 12.0 data = [] bal = principal for m in range(months + 1): data.append({"step": m, "portfolio_value": bal}) bal = bal * (1.0 + r) return pd.DataFrame(data) def _project_asset(principal: float, annual_rate: float, tenor_months: int) -> pd.DataFrame: """ Monthly amortization schedule for an asset/loan using EMI. """ emi = _emi(principal, annual_rate, tenor_months) r = annual_rate / 12.0 data = [] bal = principal for m in range(tenor_months + 1): interest = bal * r principal_pay = max(0.0, emi - interest) next_bal = max(0.0, bal - principal_pay) data.append({ "step": m, "portfolio_value": bal, "emi": emi, "interest_component": interest, "principal_component": principal_pay, "remaining_balance": next_bal }) bal = next_bal return pd.DataFrame(data) def build_timeseries(df: pd.DataFrame) -> pd.DataFrame: """ Input df columns (example): - portfolio_date (datetime or str) - instrument_type: 'Deposit' or 'Asset' - balance: float - interest_rate: annual rate (e.g., 0.12) - time_to_maturity: months (int) - tenor_months: months (for Assets; if missing, fallback to time_to_maturity) Output: Long time-series with monthly timestamps, projected 'portfolio_value' (and EMI breakdown for Assets). """ df = df.copy() if "timestamp" not in df.columns: df["timestamp"] = pd.to_datetime(df["portfolio_date"]) out_frames: List[pd.DataFrame] = [] for _, row in df.iterrows(): itype = str(row.get("instrument_type", "")).strip().lower() start = pd.to_datetime(row["timestamp"]) months = int(row.get("time_to_maturity", 0) or 0) principal = float(row.get("balance", 0.0) or 0.0) annual_rate = float(row.get("interest_rate", 0.0) or 0.0) if itype == "deposit": sched = _project_deposit(principal, annual_rate, months) elif itype == "asset": tenor = int(row.get("tenor_months", months) or months or 0) sched = _project_asset(principal, annual_rate, tenor) else: # unknown types: keep flat sched = pd.DataFrame({"step": range(months + 1), "portfolio_value": principal}) # Build timestamps: month-start frequency sched["timestamp"] = pd.date_range(start=start, periods=len(sched), freq=MONTH) # Carry identifiers for col in ["instrument_type", "interest_rate"]: if col in df.columns: sched[col] = row.get(col) sched["origin_portfolio_date"] = start sched["origin_balance"] = principal out_frames.append(sched) ts = pd.concat(out_frames, ignore_index=True) ts = ts.sort_values(["timestamp", "instrument_type"]).reset_index(drop=True) return ts