Tulitula committed on
Commit
53f36cd
·
verified ·
1 Parent(s): fb8592f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +525 -379
app.py CHANGED
@@ -1,12 +1,5 @@
1
  # app.py
2
- # Efficient Portfolio Advisor CML-consistent plotting + suggestion picker
3
- # Modality: Text. Optional reranking model: FinLang/finance-embeddings-investopedia
4
-
5
- import os
6
- import io
7
- import math
8
- import json
9
- import warnings
10
  warnings.filterwarnings("ignore")
11
 
12
  from typing import List, Tuple, Dict, Optional
@@ -15,96 +8,144 @@ import numpy as np
15
  import pandas as pd
16
  import matplotlib.pyplot as plt
17
  from PIL import Image
18
- import gradio as gr
19
  import requests
20
  import yfinance as yf
 
21
 
22
_ST_MODEL = None  # cached SentenceTransformer instance; populated lazily by _maybe_load_st_model()
 
 
 
 
 
 
 
 
 
 
23
 
24
# ---------------- Config ----------------
DATA_DIR = "data"; os.makedirs(DATA_DIR, exist_ok=True)  # cache dir for generated datasets
MARKET_TICKER = "VOO"   # S&P 500 proxy used as the CAPM market portfolio
MAX_TICKERS = 30        # hard cap on universe size
DEFAULT_LOOKBACK_YEARS = 10
DATASET_ROWS = 1000     # rows in the synthetic suggestion dataset

# Horizon (years) -> FRED constant-maturity Treasury series; first cutoff
# covering the horizon wins, 30y is the catch-all for longer horizons.
FRED_MAP = [
    (1, "DGS1"), (2, "DGS2"), (3, "DGS3"), (5, "DGS5"),
    (7, "DGS7"), (10, "DGS10"), (20, "DGS20"), (30, "DGS30"), (100, "DGS30")
]

# Canonical column orders for the UI tables.
POS_COLS = ["ticker", "amount_usd", "weight_exposure", "beta"]
SUG_COLS_HOLD = ["pick", "ticker", "weight_%", "amount_$"]
 
38
 
39
- # ---------------- Small helpers ----------------
40
def fmt_pct(x: float, dec: int = 2) -> str:
    """Format a fraction as a percent string, e.g. 0.1234 -> '12.34%'.

    Returns an em dash placeholder when *x* is not a formattable number.
    """
    try:
        return f"{x*100:.{dec}f}%"
    except (TypeError, ValueError):
        # Narrowed from a bare except: only non-numeric inputs fall back.
        return "—"
 
44
def ensure_dir(p: str):
    """Create the parent directory of path *p* if it does not exist.

    A bare filename has no directory component; calling os.makedirs("")
    would raise FileNotFoundError, so that case is a no-op.
    """
    d = os.path.dirname(p)
    if d:
        os.makedirs(d, exist_ok=True)
46
 
47
def fred_series_for_horizon(years: float) -> str:
    """Map an investment horizon in years to the matching FRED yield series."""
    horizon = min(100.0, max(1.0, float(years)))
    # The first cutoff that covers the horizon wins; DGS30 is the catch-all.
    return next((series for cap, series in FRED_MAP if horizon <= cap), "DGS30")
52
 
53
def fetch_fred_yield_annual(code: str) -> float:
    """Fetch the latest annual yield (as a fraction) for FRED series *code*.

    Downloads the series CSV and returns the last valid observation / 100.
    Falls back to 3% on network or parsing problems so the app stays
    usable offline (narrowed from a bare except).
    """
    url = f"https://fred.stlouisfed.org/graph/fredgraph.csv?id={code}"
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        df = pd.read_csv(io.StringIO(r.text))
        # Second column holds the yield; coerce bad cells ('.') to NaN.
        s = pd.to_numeric(df.iloc[:, 1], errors="coerce").dropna()
        return float(s.iloc[-1] / 100.0) if len(s) else 0.03
    except (requests.RequestException, ValueError, IndexError, pd.errors.ParserError):
        return 0.03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- # ---------------- Prices & returns ----------------
63
def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
    """Download monthly prices for *tickers* covering the last *years* years.

    Returns a DataFrame (one column per ticker) preferring 'Adj Close',
    falling back to 'Close'. Empty DataFrame when Yahoo returns nothing.
    """
    start = pd.Timestamp.today(tz="UTC") - pd.DateOffset(years=years, days=7)
    end = pd.Timestamp.today(tz="UTC")
    raw = yf.download(
        list(dict.fromkeys(tickers)),  # de-duplicate, preserve order
        start=start.date(), end=end.date(),
        interval="1mo", auto_adjust=False, progress=False,
        group_by="ticker", threads=False
    )
    if raw is None or len(raw) == 0:
        return pd.DataFrame()

    if isinstance(raw.columns, pd.MultiIndex):
        price = None
        for field in ("Adj Close", "Close"):
            if field in raw.columns.get_level_values(-1):
                price = raw.xs(field, axis=1, level=-1, drop_level=True)
                break
        if price is None:
            # Unknown layout: flatten the MultiIndex to ticker names.
            price = raw.copy()
            price.columns = [c[0] if isinstance(c, tuple) else c for c in price.columns]
    else:
        if "Adj Close" in raw.columns:
            price = raw["Adj Close"]
        elif "Close" in raw.columns:
            price = raw["Close"]
        else:
            price = raw

    if isinstance(price, pd.Series):
        price = price.to_frame()
    # fillna(method="ffill") is deprecated/removed in pandas 2.x; use ffill().
    price = price.dropna(how="all").ffill()
    price = price.loc[:, ~pd.Index(price.columns).duplicated()]
    return price
92
 
93
def monthly_returns(prices: pd.DataFrame) -> pd.DataFrame:
    """Month-over-month simple returns; the first (all-NaN) row is dropped."""
    rets = prices.pct_change()
    return rets.dropna()
95
 
96
def annualize_mean(m):
    """Scale a monthly mean return to an annual figure (x12)."""
    return 12.0 * np.asarray(m, dtype=float)
97
def annualize_sigma(s):
    """Scale a monthly volatility to annual via the square-root-of-time rule."""
    scale = math.sqrt(12.0)
    return scale * np.asarray(s, dtype=float)
 
 
 
98
 
99
- # ---------------- Search & validation ----------------
100
  def yahoo_search(query: str):
101
- if not query or not query.strip(): return []
 
102
  url = "https://query1.finance.yahoo.com/v1/finance/search"
103
  params = {"q": query.strip(), "quotesCount": 10, "newsCount": 0}
104
  headers = {"User-Agent": "Mozilla/5.0"}
105
  try:
106
  r = requests.get(url, params=params, headers=headers, timeout=10)
107
- r.raise_for_status(); data = r.json()
 
108
  out = []
109
  for q in data.get("quotes", []):
110
  sym = q.get("symbol")
@@ -113,58 +154,63 @@ def yahoo_search(query: str):
113
  if sym and sym.isascii():
114
  out.append({"symbol": sym, "name": name, "exchange": exch})
115
  if not out:
116
- out = [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": ""}]
117
  return out[:10]
118
- except:
119
- return [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": ""}]
120
 
121
def validate_tickers(symbols: List[str], years: int) -> List[str]:
    """Return the subset of *symbols* with downloadable price history.

    Symbols are upper-cased, de-duplicated (order preserved) and capped
    at MAX_TICKERS before being checked against fetched prices.
    """
    cleaned = [s.strip().upper() for s in symbols if s.strip()]
    base = list(dict.fromkeys(cleaned))[:MAX_TICKERS]
    prices = fetch_prices_monthly(base + [MARKET_TICKER], years)
    return [sym for sym in base if sym in prices.columns]
126
 
127
- # ---------------- Aligned CAPM moments (now includes MARKET in cov & μ) ----------------
128
def get_aligned_monthly_returns(symbols: List[str], years: int) -> pd.DataFrame:
    """Monthly returns for *symbols*, restricted to months where all have data."""
    unique_syms = list(dict.fromkeys(symbols))
    rets = monthly_returns(fetch_prices_monthly(unique_syms, years))
    keep = [s for s in unique_syms if s in rets.columns]
    aligned = rets[keep].dropna(how="any")
    # Defensive: drop any duplicate columns that survived the download.
    return aligned.loc[:, ~aligned.columns.duplicated()]
135
 
136
def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
    """Estimate CAPM inputs from one aligned panel of monthly returns.

    Returns a dict with per-ticker betas, the annualized covariance matrix
    (market column included), the equity risk premium, annualized market
    sigma, and annualized historical mean returns for every column.

    Raises ValueError when fewer than 3 aligned months (or no market
    column) are available.
    """
    # Align every asset with the market on common months.
    R = get_aligned_monthly_returns(symbols + [MARKET_TICKER], years)
    if MARKET_TICKER not in R.columns or R.shape[0] < 3:
        raise ValueError("Not enough aligned data to estimate moments.")
    rf_m = rf_ann / 12.0  # monthly risk-free rate

    # Means
    mu_m = R[MARKET_TICKER]; mu_m_ann = float(annualize_mean(mu_m.mean()))
    mu_all_ann = annualize_mean(R.mean(axis=0))  # pandas Series across all cols
    sigma_m_ann = float(annualize_sigma(mu_m.std(ddof=1)))
    erp_ann = float(mu_m_ann - rf_ann)

    # Betas vs market: slope of excess returns, via cov/var with a variance
    # floor to avoid division blow-ups on near-constant market series.
    ex_m = mu_m - rf_m
    var_m = float(np.var(ex_m.values, ddof=1)); var_m = max(var_m, 1e-6)
    betas: Dict[str, float] = {}
    for s in R.columns:
        ex_s = R[s] - rf_m
        betas[s] = float(np.cov(ex_s.values, ex_m.values, ddof=1)[0, 1] / var_m)
    betas[MARKET_TICKER] = 1.0  # market beta is 1 by definition

    # Covariance includes MARKET_TICKER too; x12 annualizes monthly cov.
    cov_m = np.cov(R.values.T, ddof=1)
    covA = pd.DataFrame(cov_m * 12.0, index=R.columns, columns=R.columns)

    return {
        "betas": betas,
        "cov_ann": covA,
        "erp_ann": erp_ann,
        "sigma_m_ann": sigma_m_ann,
        "mu_all_ann": pd.Series(mu_all_ann, index=R.columns)  # annualized means per asset incl. market
    }
 
168
 
169
def capm_er(beta: float, rf_ann: float, erp_ann: float) -> float:
    """Security Market Line: expected return = rf + beta * equity risk premium."""
    premium = beta * erp_ann
    return float(rf_ann + premium)
@@ -175,191 +221,254 @@ def portfolio_stats(weights: Dict[str, float],
175
  rf_ann: float,
176
  erp_ann: float) -> Tuple[float, float, float]:
177
  tickers = list(weights.keys())
178
- if len(tickers) == 0: return 0.0, 0.0, 0.0
179
  w = np.array([weights[t] for t in tickers], dtype=float)
180
- gross = float(np.sum(np.abs(w))); w_expo = w / max(gross, 1e-12)
181
-
 
 
182
  beta_p = float(np.dot([betas.get(t, 0.0) for t in tickers], w_expo))
183
- er_p = capm_er(beta_p, rf_ann, erp_ann)
184
-
185
  cov = cov_ann.reindex(index=tickers, columns=tickers).fillna(0.0).to_numpy()
186
  sigma_p = math.sqrt(float(max(w_expo.T @ cov @ w_expo, 0.0)))
187
  return beta_p, er_p, sigma_p
188
 
189
def portfolio_hist_return(weights: Dict[str, float], mu_all_ann: pd.Series) -> float:
    """Historical annual return of the portfolio using gross-exposure weights."""
    names = list(weights)
    raw = np.asarray([weights[n] for n in names], dtype=float)
    gross = max(float(np.abs(raw).sum()), 1e-12)  # guard against zero gross
    exposure = raw / gross
    # Unknown tickers contribute zero historical return.
    mu = mu_all_ann.reindex(names).fillna(0.0).to_numpy()
    return float(exposure @ mu)
195
-
196
- # ---------------- CML plot (percent axes) ----------------
197
def efficient_same_sigma(sigma_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
    """CML mix (market weight, bills weight, expected return) matching a target sigma."""
    if sigma_mkt <= 1e-12:
        # Degenerate market volatility: hold everything in bills.
        return 0.0, 1.0, rf_ann
    market_w = sigma_target / sigma_mkt
    bills_w = 1.0 - market_w
    return market_w, bills_w, rf_ann + market_w * erp_ann
201
 
202
def efficient_same_return(mu_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
    """CML mix (market weight, bills weight, sigma) matching a target return."""
    if abs(erp_ann) <= 1e-12:
        # No risk premium: leverage cannot change expected return; all bills.
        return 0.0, 1.0, rf_ann
    market_w = (mu_target - rf_ann) / erp_ann
    return market_w, 1.0 - market_w, abs(market_w) * sigma_mkt
206
 
207
def plot_cml_percent(base, suggestion=None) -> Image.Image:
    """Render the Capital Market Line and key points, with percent axes.

    *base* is a dict carrying rf/erp/market sigma plus the portfolio and
    efficient-alternative coordinates; *suggestion*, when given, is a dict
    with 'sigma' and 'mu' overlaid as one extra point. Returns a PIL image.
    """
    rf_ann = base["rf"]; erp = base["erp"]; sig_m = base["sigma_m"]
    pt_s = base["pt_sigma"]; pt_mu = base["pt_mu"]
    sames_s_s = base["same_sigma_sigma"]; sames_s_mu = base["same_sigma_mu"]
    same_mu_s = base["same_mu_sigma"]; same_mu_mu = base["same_mu_mu"]

    fig = plt.figure(figsize=(6,4), dpi=120)
    # X-range wide enough to show every plotted point with margin.
    xmax = max(0.3, sig_m*2.0, pt_s*1.4, same_mu_s*1.4, sames_s_s*1.4, (suggestion["sigma"] if suggestion else 0.0)*1.4)
    xs = np.linspace(0, xmax, 160)
    slope = erp / max(sig_m, 1e-12)  # market Sharpe ratio (CML slope)
    cml = rf_ann + slope * xs
    plt.plot(xs*100, cml*100, label="CML via Market")

    plt.scatter([0.0], [rf_ann*100], label="Risk-free (FRED)")
    plt.scatter([sig_m*100], [(rf_ann+erp)*100], label="Market VOO")
    plt.scatter([pt_s*100], [pt_mu*100], label="Your portfolio")
    plt.scatter([sames_s_s*100], [sames_s_mu*100], label="Efficient same σ")
    plt.scatter([same_mu_s*100], [same_mu_mu*100], label="Efficient same return")

    if suggestion:
        plt.scatter([suggestion["sigma"]*100], [suggestion["mu"]*100], label="Suggestion")

    plt.xlabel("σ (annualized, %)"); plt.ylabel("Expected return (annual, %)")
    plt.legend(loc="best", fontsize=8); plt.tight_layout()

    # Serialize the figure into a PNG buffer and release the figure.
    buf = io.BytesIO(); plt.savefig(buf, format="png"); plt.close(fig); buf.seek(0)
    return Image.open(buf)
234
 
235
- # ---------------- Synthetic dataset (universe only) ----------------
236
- def _row_exposures(row: pd.Series, universe: List[str]) -> Optional[np.ndarray]:
237
- try:
238
- ts = [t.strip() for t in str(row["tickers"]).split(",")]
239
- ws = [float(x) for x in str(row["weights"]).split(",")]
240
- wmap = {t: ws[i] for i, t in enumerate(ts) if i < len(ws)}
241
- w = np.array([wmap.get(t, 0.0) for t in universe], dtype=float)
242
- gross = float(np.sum(np.abs(w)));
243
- if gross <= 1e-12: return None
244
- return w / gross
245
- except: return None
246
 
247
def build_synthetic_dataset(universe: List[str], years: int, rf_ann: float, erp_ann: float, n_rows: int = DATASET_ROWS) -> pd.DataFrame:
    """Generate a table of random candidate portfolios over *universe*.

    Each row holds a handful of tickers with Dirichlet-distributed weights,
    random signs (20% short legs) and leverage, plus CAPM stats computed
    from current moment estimates. The fixed RNG seed keeps the dataset
    reproducible for a given universe.
    """
    moms = estimate_all_moments_aligned(universe, years, rf_ann)
    covA, betas = moms["cov_ann"], moms["betas"]

    rng = np.random.default_rng(12345); rows = []
    for i in range(n_rows):
        # Portfolio size: 2..7 names, bounded by the universe size.
        k = int(rng.integers(low=min(2, len(universe)), high=min(8, len(universe)) + 1))
        picks = list(rng.choice(universe, size=k, replace=False))
        signs = rng.choice([-1.0, 1.0], size=k, p=[0.2, 0.8])  # 20% chance a leg is short
        raw = rng.dirichlet(np.ones(k))
        gross = 1.0 + float(rng.gamma(2.0, 0.5))  # gross exposure > 1 (leverage)
        w = gross * signs * raw
        beta_p, er_p, sigma_p = portfolio_stats({picks[j]: w[j] for j in range(k)}, covA, betas, rf_ann, erp_ann)
        rows.append({
            "id": i,
            "tickers": ",".join(picks),
            "weights": ",".join(f"{x:.6f}" for x in w),
            "er_p": er_p, "sigma_p": sigma_p, "beta_p": beta_p
        })
    return pd.DataFrame(rows)
267
 
268
def dataset_path_for_universe(universe: List[str]) -> str:
    """Deterministic CSV cache path for a given ticker universe.

    Uses a stable hashlib digest instead of the built-in hash(), which is
    salted per process (PYTHONHASHSEED): with hash() the path changed on
    every restart, so the on-disk dataset cache could never be reused.
    """
    import hashlib
    key = ",".join(sorted(universe))
    h = int(hashlib.sha1(key.encode("utf-8")).hexdigest(), 16) % (10**8)
    return os.path.join(DATA_DIR, f"investor_profiles_{h}.csv")
272
-
273
- # ---------------- Suggestions (build + picker) ----------------
274
- def _risk_targets(sigmas: np.ndarray) -> Dict[str, float]:
275
- return {"Low": float(np.quantile(sigmas, 0.15)),
276
- "Medium": float(np.quantile(sigmas, 0.50)),
277
- "High": float(np.quantile(sigmas, 0.85))}
278
-
279
def _describe_row_for_embeddings(row: pd.Series, universe: List[str]) -> str:
    """Compact text description of a candidate row, for embedding similarity."""
    pieces = [
        f"sigma {row['sigma_p']:.4f}",
        f"beta {row['beta_p']:.2f}",
        f"expected return {row['er_p']:.4f}",
    ]
    exposures = _row_exposures(row, universe)
    if exposures is not None:
        # Name the four largest absolute exposures as the portfolio "focus".
        ranked = sorted(
            [(name, float(abs(x))) for name, x in zip(universe, exposures)],
            key=lambda kv: -kv[1],
        )[:4]
        pieces.append("focus " + ", ".join(f"{t}:{w:.2f}" for t, w in ranked))
    return " ".join(pieces)
286
-
287
def _maybe_load_st_model():
    """Lazily load and cache the finance sentence-embedding model.

    The import lives inside the function so the heavy dependency is only
    paid for when embedding-based reranking is actually requested.
    """
    global _ST_MODEL
    if _ST_MODEL is None:
        from sentence_transformers import SentenceTransformer
        _ST_MODEL = SentenceTransformer("FinLang/finance-embeddings-investopedia")
    return _ST_MODEL
293
-
294
def build_suggestions(csv_path: str,
                      universe: List[str],
                      total_amount: float,
                      risk_level: str,
                      use_embeddings: bool,
                      covA: pd.DataFrame,
                      betas: Dict[str, float],
                      rf_ann: float,
                      erp_ann: float,
                      mu_all_ann: pd.Series):
    """Select up to 3 candidate mixes from the synthetic dataset.

    Candidates are ranked by closeness to the risk level's target sigma,
    optionally re-ranked with finance embeddings, then scored with a small
    expected-return bonus. Returns (suggestions list, holdings table for
    pick #1); each suggestion dict carries its holdings DataFrame plus
    recomputed historical/CAPM stats.
    """
    try: df = pd.read_csv(csv_path)
    except Exception: return [], pd.DataFrame(columns=SUG_COLS_HOLD)

    if df.empty: return [], pd.DataFrame(columns=SUG_COLS_HOLD)

    # Target sigma comes from the dataset's own distribution of risks.
    sigmas = df["sigma_p"].to_numpy(dtype=float)
    target_sigma = _risk_targets(sigmas).get(risk_level, float(np.median(sigmas)))

    # Shortlist the 100 rows whose sigma is closest to the target.
    df = df.copy(); df["dist"] = (df["sigma_p"] - target_sigma).abs()
    cand = df.nsmallest(100, "dist").reset_index(drop=True)
    if use_embeddings:
        # Optional semantic re-rank against a risk-level prompt.
        model = _maybe_load_st_model()
        prompt = {"Low":"low risk conservative mix","Medium":"balanced moderate risk","High":"aggressive growth high risk"}[risk_level]
        texts = [prompt] + [_describe_row_for_embeddings(r, universe) for _, r in cand.iterrows()]
        embs = model.encode(texts)
        S = model.similarity(embs[0:1], embs[1:]).flatten()
        cand = cand.assign(sim=S).sort_values("sim", ascending=False).head(50).reset_index(drop=True)

    # Lower score is better: close to target sigma, higher expected return.
    cand["score"] = cand["dist"] - 0.2*cand["er_p"]
    picks = cand.nsmallest(3, "score").reset_index(drop=True)

    suggestions = []
    for i, row in picks.iterrows():
        expo = _row_exposures(row, universe)
        if expo is None: continue
        # Keep only materially non-zero exposures.
        wmap = {universe[j]: float(expo[j]) for j in range(len(universe)) if abs(float(expo[j])) > 1e-4}
        # recompute metrics using current moments (historical μ for plotting)
        beta_s, er_capm_s, sigma_s = portfolio_stats(wmap, covA, betas, rf_ann, erp_ann)
        mu_hist_s = portfolio_hist_return(wmap, mu_all_ann)
        # holdings table for this pick, largest absolute weights first
        rows_hold = [{
            "pick": i+1,
            "ticker": t,
            "weight_%": round(w*100.0, 2),
            "amount_$": round(w*total_amount, 2)
        } for t, w in sorted(wmap.items(), key=lambda kv: -abs(kv[1]))]
        suggestions.append({
            "pick": i+1,
            "hold_df": pd.DataFrame(rows_hold, columns=SUG_COLS_HOLD),
            "mu_hist": mu_hist_s, "sigma_hist": sigma_s,
            "beta": beta_s, "er_capm": er_capm_s
        })

    first_table = suggestions[0]["hold_df"] if suggestions else pd.DataFrame(columns=SUG_COLS_HOLD)
    return suggestions, first_table
350
-
351
- # ---------------- UI callbacks ----------------
352
def search_tickers_cb(q: str):
    """Search Yahoo for *q*; return a status note plus dropdown option strings."""
    hits = yahoo_search(q)
    if not hits:
        return "No matches", []
    options = ["{} | {} | {}".format(h["symbol"], h["name"], h["exchange"]) for h in hits]
    return "Select a symbol and click Add", options
357
 
358
  def add_symbol(selection: str, table: pd.DataFrame):
359
- if not selection: return table, "Pick a row from Matches first."
 
360
  symbol = selection.split("|")[0].strip().upper()
361
  current = [] if table is None or len(table) == 0 else [str(x).upper() for x in table["ticker"].tolist() if str(x) != "nan"]
362
  tickers = current if symbol in current else current + [symbol]
 
 
363
  val = validate_tickers(tickers, years=DEFAULT_LOOKBACK_YEARS)
364
  tickers = [t for t in tickers if t in val]
365
  amt_map = {}
@@ -371,7 +480,8 @@ def add_symbol(selection: str, table: pd.DataFrame):
371
  new_table = pd.DataFrame({"ticker": tickers, "amount_usd": [amt_map.get(t, 0.0) for t in tickers]})
372
  msg = f"Added {symbol}" if symbol in tickers else f"{symbol} not valid"
373
  if len(new_table) > MAX_TICKERS:
374
- new_table = new_table.iloc[:MAX_TICKERS]; msg = f"Reached max of {MAX_TICKERS}"
 
375
  return new_table, msg
376
 
377
  def lock_ticker_column(tb: pd.DataFrame):
@@ -384,165 +494,196 @@ def lock_ticker_column(tb: pd.DataFrame):
384
  amounts = amounts[:len(tickers)] + [0.0] * max(0, len(tickers) - len(amounts))
385
  return pd.DataFrame({"ticker": tickers, "amount_usd": amounts})
386
 
387
# Module-level horizon state; mutated in place by set_horizon().
HORIZON_YEARS = 10
RF_CODE = fred_series_for_horizon(HORIZON_YEARS)  # FRED series matching the horizon
RF_ANN = fetch_fred_yield_annual(RF_CODE)         # latest annual risk-free rate (fraction)
390
-
391
def set_horizon(years: float):
    """Clamp the horizon to [1, 100] years and refresh the risk-free rate.

    Updates the module-level HORIZON_YEARS / RF_CODE / RF_ANN and returns
    a human-readable status line for the UI.
    """
    global HORIZON_YEARS, RF_CODE, RF_ANN
    clamped = max(1.0, min(100.0, float(years)))
    series = fred_series_for_horizon(clamped)
    rate = fetch_fred_yield_annual(series)
    HORIZON_YEARS = clamped
    RF_CODE = series
    RF_ANN = rate
    return f"Risk-free series {series}. Latest annual rate {fmt_pct(rate)}. Horizon set to {int(round(clamped))} years."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
 
398
def compute(lookback_years: int,
            table: pd.DataFrame,
            risk_level: str,
            use_embeddings: bool):
    """End-to-end pipeline for the Compute button.

    Validates the user's ticker/amount table, estimates CAPM moments,
    builds or loads the synthetic dataset, ranks suggestions and renders
    the CML plot.

    Returns a 9-tuple matching the Gradio outputs wiring:
    (plot image, summary markdown, universe message, positions table,
     first suggestion holdings table, dataset CSV path, suggestions state,
     plot state, pick-info markdown).
    """
    df = table.dropna()
    df["ticker"] = df["ticker"].astype(str).str.upper().str.strip()
    df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce").fillna(0.0)
    symbols = [t for t in df["ticker"].tolist() if t]
    if len(symbols) == 0:
        empty_hold = pd.DataFrame(columns=SUG_COLS_HOLD)
        empty_pos = pd.DataFrame(columns=POS_COLS)
        # BUGFIX: every return must carry 9 values (trailing pick_info "")
        # to match the 9 outputs wired to run_btn.click; the early returns
        # previously yielded only 8, breaking the Gradio callback.
        return None, "Add at least one ticker.", "—", empty_pos, empty_hold, None, [], {}, ""

    symbols = validate_tickers(symbols, lookback_years)
    if len(symbols) == 0:
        empty_hold = pd.DataFrame(columns=SUG_COLS_HOLD)
        empty_pos = pd.DataFrame(columns=POS_COLS)
        return None, "Could not validate any tickers.", "—", empty_pos, empty_hold, None, [], {}, ""

    universe = list(sorted(set(symbols + [MARKET_TICKER])))[:MAX_TICKERS]

    df = df[df["ticker"].isin(symbols)].copy()
    amounts = {r["ticker"]: float(r["amount_usd"]) for _, r in df.iterrows()}
    total_amt = float(sum(abs(v) for v in amounts.values()))
    if total_amt <= 1e-12:
        empty_hold = pd.DataFrame(columns=SUG_COLS_HOLD)
        empty_pos = pd.DataFrame(columns=POS_COLS)
        return None, "All amounts are zero.", f"Universe set to {', '.join(universe)}", empty_pos, empty_hold, None, [], {}, ""

    weights = {k: v / total_amt for k, v in amounts.items()}

    moms = estimate_all_moments_aligned(universe, lookback_years, RF_ANN)
    betas, covA, erp_ann = moms["betas"], moms["cov_ann"], moms["erp_ann"]
    sigma_mkt, mu_all_ann = moms["sigma_m_ann"], moms["mu_all_ann"]

    beta_p, er_capm_p, sigma_p = portfolio_stats(weights, covA, betas, RF_ANN, erp_ann)
    mu_hist_p = portfolio_hist_return(weights, mu_all_ann)  # used for plotting

    a_sigma, b_sigma, mu_eff_sigma = efficient_same_sigma(sigma_p, RF_ANN, erp_ann, sigma_mkt)
    a_mu, b_mu, sigma_eff_mu = efficient_same_return(mu_hist_p, RF_ANN, erp_ann, sigma_mkt)

    # dataset for this universe (built once, then cached on disk)
    csv_path = dataset_path_for_universe(universe)
    if not os.path.exists(csv_path):
        synth = build_synthetic_dataset(universe, lookback_years, RF_ANN, erp_ann, n_rows=DATASET_ROWS)
        ensure_dir(csv_path)
        synth.to_csv(csv_path, index=False)

    # suggestions list + first table
    suggestions, first_table = build_suggestions(
        csv_path, universe, total_amt, risk_level, use_embeddings,
        covA, betas, RF_ANN, erp_ann, mu_all_ann
    )

    # plot state + initial image with first suggestion overlay
    plot_state = {
        "rf": RF_ANN, "erp": erp_ann, "sigma_m": sigma_mkt,
        "pt_sigma": sigma_p, "pt_mu": mu_hist_p,
        "same_sigma_sigma": sigma_p, "same_sigma_mu": mu_eff_sigma,
        "same_mu_sigma": sigma_eff_mu, "same_mu_mu": mu_hist_p
    }
    sug_overlay = {"sigma": suggestions[0]["sigma_hist"], "mu": suggestions[0]["mu_hist"]} if suggestions else None
    img = plot_cml_percent(plot_state, suggestion=sug_overlay)

    # summary text (show both CAPM and historical for your portfolio)
    info_lines = [
        "### Inputs",
        f"- Lookback years {int(lookback_years)}",
        f"- Horizon years {int(round(HORIZON_YEARS))}",
        f"- Risk-free {fmt_pct(RF_ANN)} from {RF_CODE}",
        f"- Market ERP {fmt_pct(erp_ann)}",
        f"- Market σ {fmt_pct(sigma_mkt)}",
        "",
        "### Your portfolio",
        f"- Beta {beta_p:.2f}",
        f"- σ (historical) {fmt_pct(sigma_p)}",
        f"- Expected return (historical) {fmt_pct(mu_hist_p)}",
        f"- Expected return (CAPM / SML) {fmt_pct(er_capm_p)}",
        "",
        "### Efficient alternatives on CML",
        f"- Same σ as your portfolio → Market {a_sigma:.2f}, Bills {b_sigma:.2f}, return {fmt_pct(mu_eff_sigma)}",
        f"- Same return (historical) → Market {a_mu:.2f}, Bills {b_mu:.2f}, σ {fmt_pct(sigma_eff_mu)}",
        "",
        f"### Dataset-based suggestions (risk: {risk_level})",
        "- Use the selector below to flip between Pick #1 / #2 / #3. Table shows % exposure and $ amounts."
    ]
    if use_embeddings:
        info_lines.append("- Reranked with finance embeddings (FinLang/finance-embeddings-investopedia).")
    info = "\n".join(info_lines)

    # positions table
    rows = []
    for t in symbols:
        rows.append({
            "ticker": t,
            "amount_usd": round(amounts.get(t, 0.0), 2),
            "weight_exposure": round(weights.get(t, 0.0), 6),
            "beta": round(betas.get(t, np.nan), 6),
        })
    pos_table = pd.DataFrame(rows, columns=POS_COLS)

    uni_msg = f"Universe set to: {', '.join(universe)}"
    # also return a short pick-info for pick #1
    pick_info = ""
    if suggestions:
        s = suggestions[0]
        pick_info = (f"**Pick #1** σ {fmt_pct(s['sigma_hist'])}, "
                     f"ER (hist) {fmt_pct(s['mu_hist'])}, "
                     f"ER (CAPM) {fmt_pct(s['er_capm'])}, beta {s['beta']:.2f}")

    return img, info, uni_msg, pos_table, first_table, csv_path, suggestions, plot_state, pick_info
510
-
511
def change_pick(idx: int, suggestions, plot_state):
    """Switch the displayed suggestion (idx is 1..3); refresh table and plot."""
    if not suggestions or idx is None:
        return pd.DataFrame(columns=SUG_COLS_HOLD), plot_cml_percent(plot_state), ""
    i = int(idx) - 1
    if i < 0 or i >= len(suggestions):
        i = 0  # clamp out-of-range selections to the first pick
    s = suggestions[i]
    img = plot_cml_percent(plot_state, suggestion={"sigma": s["sigma_hist"], "mu": s["mu_hist"]})
    # BUGFIX: label with the clamped index (i + 1), not the raw slider value,
    # so the header always matches the pick actually displayed.
    pick_info = (f"**Pick #{i + 1}** — σ {fmt_pct(s['sigma_hist'])}, "
                 f"ER (hist) {fmt_pct(s['mu_hist'])}, "
                 f"ER (CAPM) {fmt_pct(s['er_capm'])}, beta {s['beta']:.2f}")
    return s["hold_df"], img, pick_info
524
-
525
- # ---------------- UI ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
526
  with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
527
  with gr.Accordion("About (assignment section 1)", open=False):
528
  gr.Markdown(
529
- "**Modality**: Text.\n\n"
530
- "**Use case**: Given a user’s stock/ETF universe and dollar amounts, the system recommends three "
531
- "alternative mixes (Low / Medium / High risk) drawn from a 1,000-row dataset generated from the user’s current universe.\n\n"
532
- "**System goal**: User inputs text (tickers & amounts). System returns three similar items (suggested mixes) from the dataset. "
533
- "Optional reranking uses the text-embedding model `FinLang/finance-embeddings-investopedia`."
 
534
  )
535
 
536
  gr.Markdown(
537
  "## Efficient Portfolio Advisor\n"
538
  "Search symbols, enter dollar amounts, set your horizon. Prices from Yahoo Finance. Risk-free from FRED. "
539
- "Suggestions are built only from your current universe and optionally refined with finance embeddings."
 
540
  )
541
 
542
  with gr.Row():
543
  with gr.Column(scale=1):
544
  q = gr.Textbox(label="Search symbol")
545
- search_note = gr.Markdown(" ")
546
  matches = gr.Dropdown(choices=[], label="Matches")
547
  with gr.Row():
548
  search_btn = gr.Button("Search")
@@ -552,8 +693,8 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
552
  table = gr.Dataframe(
553
  headers=["ticker", "amount_usd"],
554
  datatype=["str", "number"],
555
- row_count=0, col_count=(2, "fixed"),
556
- value=pd.DataFrame(columns=["ticker", "amount_usd"])
557
  )
558
 
559
  horizon = gr.Number(label="Horizon in years (1–100)", value=HORIZON_YEARS, precision=0)
@@ -561,41 +702,46 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
561
 
562
  gr.Markdown("### Suggestions")
563
  risk = gr.Radio(["Low", "Medium", "High"], value="Medium", label="Risk tolerance")
564
- use_st = gr.Checkbox(label="Use finance embeddings to refine picks", value=False)
565
- run_btn = gr.Button("Compute (build dataset & suggest)")
 
566
 
567
  with gr.Column(scale=1):
568
  plot = gr.Image(label="Capital Market Line (CML)", type="pil")
569
- summary = gr.Markdown(label="Summary")
570
  universe_msg = gr.Textbox(label="Universe status", interactive=False)
 
571
  positions = gr.Dataframe(
572
  label="Computed positions",
573
  headers=POS_COLS,
574
  datatype=["str", "number", "number", "number"],
575
  col_count=(len(POS_COLS), "fixed"),
576
- value=pd.DataFrame(columns=POS_COLS),
577
  interactive=False
578
  )
579
 
580
- # Suggestion picker
581
- pick_slider = gr.Slider(1, 3, value=1, step=1, label="View suggested mix #", interactive=True)
582
- pick_info = gr.Markdown("")
583
- suggestions_tbl = gr.Dataframe(
584
- label="Holdings (for selected pick) — percent & dollars",
585
- headers=SUG_COLS_HOLD,
586
- datatype=["number", "str", "number", "number"],
587
- col_count=(len(SUG_COLS_HOLD), "fixed"),
588
- value=pd.DataFrame(columns=SUG_COLS_HOLD),
 
589
  interactive=False
590
  )
591
- dl = gr.File(label="Generated dataset CSV", value=None, visible=True)
 
 
 
592
 
593
- # States to support picker
594
- sug_state = gr.State([])
595
- plot_state = gr.State({})
 
596
 
597
- # Wire up events
598
- def do_search(query): note, options = search_tickers_cb(query); return note, gr.update(choices=options)
599
  search_btn.click(fn=do_search, inputs=q, outputs=[search_note, matches])
600
  add_btn.click(fn=add_symbol, inputs=[matches, table], outputs=[table, search_note])
601
  table.change(fn=lock_ticker_column, inputs=table, outputs=table)
@@ -603,14 +749,14 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
603
 
604
  run_btn.click(
605
  fn=compute,
606
- inputs=[lookback, table, risk, use_st],
607
- outputs=[plot, summary, universe_msg, positions, suggestions_tbl, dl, sug_state, plot_state, pick_info]
608
  )
609
 
610
- pick_slider.change(
611
- fn=change_pick,
612
- inputs=[pick_slider, sug_state, plot_state],
613
- outputs=[suggestions_tbl, plot, pick_info]
614
  )
615
 
616
  if __name__ == "__main__":
 
1
  # app.py
2
+ import os, io, math, json, time, random, warnings
 
 
 
 
 
 
 
3
  warnings.filterwarnings("ignore")
4
 
5
  from typing import List, Tuple, Dict, Optional
 
8
  import pandas as pd
9
  import matplotlib.pyplot as plt
10
  from PIL import Image
 
11
  import requests
12
  import yfinance as yf
13
+ import gradio as gr
14
 
15
+ # Optional: finance embeddings for mild re-ranking of candidates
16
+ try:
17
+ from sentence_transformers import SentenceTransformer
18
+ _EMB_MODEL = "FinLang/finance-embeddings-investopedia"
19
+ _emb = SentenceTransformer(_EMB_MODEL)
20
+ except Exception:
21
+ _emb = None
22
+
23
+ # ---------------- config ----------------
24
+ DATA_DIR = "data"
25
+ os.makedirs(DATA_DIR, exist_ok=True)
26
 
 
 
 
 
27
  DEFAULT_LOOKBACK_YEARS = 10
28
+ MAX_TICKERS = 25
29
+ MARKET_TICKER = "VOO"
30
+
31
+ POS_COLS = ["ticker", "amount_usd", "weight_exposure", "beta"]
32
+ SUG_TABLE_COLS = ["ticker", "weight_%", "amount_$"]
33
 
34
  FRED_MAP = [
35
+ (1, "DGS1"),
36
+ (2, "DGS2"),
37
+ (3, "DGS3"),
38
+ (5, "DGS5"),
39
+ (7, "DGS7"),
40
+ (10, "DGS10"),
41
+ (20, "DGS20"),
42
+ (30, "DGS30"),
43
+ (100, "DGS30"),
44
  ]
45
 
46
+ # ---------------- helpers ----------------
47
+ def ensure_data_dir():
48
+ os.makedirs(DATA_DIR, exist_ok=True)
49
 
50
+ def empty_positions_df():
51
+ return pd.DataFrame(columns=POS_COLS)
 
 
52
 
53
+ def empty_suggest_df():
54
+ return pd.DataFrame(columns=SUG_TABLE_COLS)
55
 
56
  def fred_series_for_horizon(years: float) -> str:
57
  y = max(1.0, min(100.0, float(years)))
58
  for cutoff, code in FRED_MAP:
59
+ if y <= cutoff:
60
+ return code
61
  return "DGS30"
62
 
63
  def fetch_fred_yield_annual(code: str) -> float:
64
  url = f"https://fred.stlouisfed.org/graph/fredgraph.csv?id={code}"
65
  try:
66
+ r = requests.get(url, timeout=10)
67
+ r.raise_for_status()
68
  df = pd.read_csv(io.StringIO(r.text))
69
  s = pd.to_numeric(df.iloc[:, 1], errors="coerce").dropna()
70
  return float(s.iloc[-1] / 100.0) if len(s) else 0.03
71
+ except Exception:
72
+ return 0.03
73
+
74
+ def _extract_close(df: pd.DataFrame, tickers: List[str]) -> pd.DataFrame:
75
+ """
76
+ Make yfinance output consistently a (date x tickers) DataFrame of Close prices.
77
+ Handles single/multi ticker and (Adj Close|Close) cases.
78
+ """
79
+ if isinstance(df, pd.Series):
80
+ # Rare, but normalize
81
+ out = df.to_frame(name=tickers[0])
82
+ return out
83
+
84
+ if isinstance(df.columns, pd.MultiIndex):
85
+ lv0 = df.columns.get_level_values(0)
86
+ if "Close" in lv0:
87
+ px = df["Close"].copy()
88
+ elif "Adj Close" in lv0:
89
+ px = df["Adj Close"].copy()
90
+ else:
91
+ # Fallback to the first price-like level
92
+ first = next((x for x in ["Adj Close", "Close", "Close*"] if x in lv0), None)
93
+ if first is None:
94
+ first = lv0[0]
95
+ px = df[first].copy()
96
+ px.columns = [str(c) for c in px.columns]
97
+ return px
98
+
99
+ # Single ticker case with flat columns
100
+ candidates = [c for c in ["Close", "Adj Close"] if c in df.columns]
101
+ if candidates:
102
+ col = candidates[0]
103
+ return df[[col]].rename(columns={col: tickers[0]})
104
+
105
+ # Fallback: take first numeric column
106
+ first_num = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
107
+ if first_num:
108
+ out = df[[first_num[0]]].copy()
109
+ out.columns = [tickers[0]]
110
+ return out
111
+
112
+ raise ValueError("Could not extract a price column")
113
 
 
114
  def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
115
  start = pd.Timestamp.today(tz="UTC") - pd.DateOffset(years=years, days=7)
116
  end = pd.Timestamp.today(tz="UTC")
117
+ df = yf.download(
118
  list(dict.fromkeys(tickers)),
119
+ start=start.date(),
120
+ end=end.date(),
121
+ interval="1mo",
122
+ auto_adjust=True,
123
+ progress=False,
124
+ group_by="column",
125
  )
126
+ px = _extract_close(df, tickers)
127
+ px = px.dropna(how="all").ffill()
128
+ return px
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
  def monthly_returns(prices: pd.DataFrame) -> pd.DataFrame:
131
  return prices.pct_change().dropna()
132
 
133
+ def annualize_mean(m):
134
+ return np.asarray(m, dtype=float) * 12.0
135
+
136
+ def annualize_sigma(s):
137
+ return np.asarray(s, dtype=float) * math.sqrt(12.0)
138
 
 
139
  def yahoo_search(query: str):
140
+ if not query or len(query.strip()) == 0:
141
+ return []
142
  url = "https://query1.finance.yahoo.com/v1/finance/search"
143
  params = {"q": query.strip(), "quotesCount": 10, "newsCount": 0}
144
  headers = {"User-Agent": "Mozilla/5.0"}
145
  try:
146
  r = requests.get(url, params=params, headers=headers, timeout=10)
147
+ r.raise_for_status()
148
+ data = r.json()
149
  out = []
150
  for q in data.get("quotes", []):
151
  sym = q.get("symbol")
 
154
  if sym and sym.isascii():
155
  out.append({"symbol": sym, "name": name, "exchange": exch})
156
  if not out:
157
+ out = [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": "n a"}]
158
  return out[:10]
159
+ except Exception:
160
+ return [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": "n a"}]
161
 
162
  def validate_tickers(symbols: List[str], years: int) -> List[str]:
163
+ base = [s for s in dict.fromkeys(symbols) if s]
164
+ try:
165
+ px = fetch_prices_monthly(base + [MARKET_TICKER], years)
166
+ except Exception:
167
+ return []
168
  ok = [s for s in base if s in px.columns]
169
  return ok
170
 
171
+ # -------------- aligned moments --------------
172
  def get_aligned_monthly_returns(symbols: List[str], years: int) -> pd.DataFrame:
173
+ uniq = [c for c in dict.fromkeys(symbols) if c != MARKET_TICKER]
174
+ tickers = uniq + [MARKET_TICKER]
175
+ px = fetch_prices_monthly(tickers, years)
176
  rets = monthly_returns(px)
177
+ cols = [c for c in uniq if c in rets.columns] + ([MARKET_TICKER] if MARKET_TICKER in rets.columns else [])
178
  R = rets[cols].dropna(how="any")
179
  return R.loc[:, ~R.columns.duplicated()]
180
 
181
  def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
182
  R = get_aligned_monthly_returns(symbols + [MARKET_TICKER], years)
183
  if MARKET_TICKER not in R.columns or R.shape[0] < 3:
184
+ raise ValueError("Not enough aligned returns (market missing or few rows).")
185
  rf_m = rf_ann / 12.0
186
 
187
+ m = R[MARKET_TICKER]
188
+ if isinstance(m, pd.DataFrame):
189
+ m = m.iloc[:, 0].squeeze()
190
+
191
+ mu_m_ann = float(annualize_mean(m.mean()))
192
+ sigma_m_ann = float(annualize_sigma(m.std(ddof=1)))
193
  erp_ann = float(mu_m_ann - rf_ann)
194
 
195
+ ex_m = m - rf_m
196
+ var_m = float(np.var(ex_m.values, ddof=1))
197
+ var_m = max(var_m, 1e-8)
198
+
199
  betas: Dict[str, float] = {}
200
+ for s in [c for c in R.columns if c != MARKET_TICKER]:
201
  ex_s = R[s] - rf_m
202
  betas[s] = float(np.cov(ex_s.values, ex_m.values, ddof=1)[0, 1] / var_m)
 
203
 
204
+ betas[MARKET_TICKER] = 1.0 # by definition
 
 
205
 
206
+ asset_cols = [c for c in R.columns if c != MARKET_TICKER]
207
+ if asset_cols:
208
+ cov_m = np.cov(R[asset_cols].values.T, ddof=1)
209
+ covA = pd.DataFrame(cov_m * 12.0, index=asset_cols, columns=asset_cols)
210
+ else:
211
+ covA = pd.DataFrame([], index=[], columns=[])
212
+
213
+ return {"betas": betas, "cov_ann": covA, "erp_ann": erp_ann, "sigma_m_ann": sigma_m_ann}
214
 
215
  def capm_er(beta: float, rf_ann: float, erp_ann: float) -> float:
216
  return float(rf_ann + beta * erp_ann)
 
221
  rf_ann: float,
222
  erp_ann: float) -> Tuple[float, float, float]:
223
  tickers = list(weights.keys())
 
224
  w = np.array([weights[t] for t in tickers], dtype=float)
225
+ gross = float(np.sum(np.abs(w)))
226
+ if gross == 0:
227
+ return 0.0, 0.0, 0.0
228
+ w_expo = w / gross
229
  beta_p = float(np.dot([betas.get(t, 0.0) for t in tickers], w_expo))
230
+ er_p = capm_er(beta_p, rf_ann, erp_ann) # CAPM expected return
 
231
  cov = cov_ann.reindex(index=tickers, columns=tickers).fillna(0.0).to_numpy()
232
  sigma_p = math.sqrt(float(max(w_expo.T @ cov @ w_expo, 0.0)))
233
  return beta_p, er_p, sigma_p
234
 
235
+ # -------------- CML helpers --------------
 
 
 
 
 
 
 
236
  def efficient_same_sigma(sigma_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
237
+ if sigma_mkt <= 1e-12:
238
+ return 0.0, 1.0, rf_ann
239
  a = sigma_target / sigma_mkt
240
  return a, 1.0 - a, rf_ann + a * erp_ann
241
 
242
  def efficient_same_return(mu_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
243
+ if abs(erp_ann) <= 1e-12:
244
+ return 0.0, 1.0, rf_ann
245
  a = (mu_target - rf_ann) / erp_ann
246
  return a, 1.0 - a, abs(a) * sigma_mkt
247
 
248
+ def _pct(x: float) -> float:
249
+ return float(x) * 100.0
250
+
251
+ def plot_cml(
252
+ rf_ann, erp_ann, sigma_mkt,
253
+ pt_sigma, pt_mu, # <-- portfolio CAPM point
254
+ same_sigma_sigma, same_sigma_mu,
255
+ same_mu_sigma, same_mu_mu,
256
+ sugg_sigma=None, sugg_mu=None
257
+ ) -> Image.Image:
258
+ fig = plt.figure(figsize=(6.4, 4.2), dpi=140)
259
+
260
+ xmax = max(0.30, sigma_mkt * 2.0, pt_sigma * 1.4, same_mu_sigma * 1.4, same_sigma_sigma * 1.4, (sugg_sigma or 0.0) * 1.4)
261
+ xs = np.linspace(0, xmax, 200)
262
+ slope = erp_ann / max(sigma_mkt, 1e-12)
263
  cml = rf_ann + slope * xs
264
+ plt.plot(_pct(xs), _pct(cml), label="CML via Market", linewidth=1.8)
265
+
266
+ # Key points
267
+ plt.scatter([0.0], [_pct(rf_ann)], label="Risk-free (FRED)")
268
+ plt.scatter([_pct(sigma_mkt)], [_pct(rf_ann + erp_ann)], label=f"Market {MARKET_TICKER}")
269
+ plt.scatter([_pct(pt_sigma)], [_pct(pt_mu)], label="Your portfolio (CAPM)")
270
+
271
+ plt.scatter([_pct(same_sigma_sigma)], [_pct(same_sigma_mu)], label="Efficient same σ")
272
+ plt.scatter([_pct(same_mu_sigma)], [_pct(same_mu_mu)], label="Efficient same return")
273
+ if sugg_sigma is not None and sugg_mu is not None:
274
+ plt.scatter([_pct(sugg_sigma)], [_pct(sugg_mu)], label="Suggestion")
275
+
276
+ plt.xlabel("σ (annualized, %)")
277
+ plt.ylabel("Expected return (annual, %)")
278
+ plt.legend(loc="best", fontsize=8)
279
+ plt.tight_layout()
280
+
281
+ buf = io.BytesIO()
282
+ plt.savefig(buf, format="png")
283
+ plt.close(fig)
284
+ buf.seek(0)
285
  return Image.open(buf)
286
 
287
+ # -------------- synthetic dataset for suggestions --------------
288
+ def synth_profile(rng: np.random.Generator) -> str:
289
+ risk = rng.choice(["cautious", "balanced", "moderate", "growth", "aggressive"])
290
+ horizon = rng.choice(["three years", "five years", "seven years", "ten years", "fifteen years"])
291
+ goal = rng.choice(["retirement savings", "first home", "education fund", "wealth building", "travel fund", "emergency buffer"])
292
+ return f"{risk} investor, {horizon} horizon, goal is {goal}."
 
 
 
 
 
293
 
294
+ def build_synthetic_dataset(universe: List[str], years: int, rf_ann: float, erp_ann: float, covA: pd.DataFrame, betas: Dict[str, float]) -> pd.DataFrame:
295
+ # build 1,000 random portfolios over the user universe (CAPM ER, cov-based sigma)
296
+ rng = np.random.default_rng(42 + int(time.time()) % 10000)
297
+ rows = []
298
+ for i in range(1000):
299
+ k = rng.integers(low=min(2, len(universe)), high=min(8, len(universe)) + 1)
 
300
  picks = list(rng.choice(universe, size=k, replace=False))
301
+ signs = rng.choice([-1.0, 1.0], size=k, p=[0.25, 0.75])
302
  raw = rng.dirichlet(np.ones(k))
303
  gross = 1.0 + float(rng.gamma(2.0, 0.5))
304
+ w = gross * signs * raw # exposure weights that sum (in abs) to gross
305
+
306
+ wmap = {picks[j]: float(w[j]) for j in range(k)}
307
+ beta_p, er_p, sigma_p = portfolio_stats(wmap, covA, betas, rf_ann, erp_ann)
308
+
309
  rows.append({
310
  "id": i,
311
+ "profile_text": synth_profile(rng),
312
  "tickers": ",".join(picks),
313
  "weights": ",".join(f"{x:.6f}" for x in w),
314
+ "beta_p": beta_p,
315
+ "er_p": er_p,
316
+ "sigma_p": sigma_p
317
  })
318
  return pd.DataFrame(rows)
319
 
320
+ def _row_to_exposures(row: pd.Series, universe: List[str]) -> Optional[np.ndarray]:
321
+ try:
322
+ ts = [t.strip() for t in str(row["tickers"]).split(",")]
323
+ ws = [float(x) for x in str(row["weights"]).split(",")]
324
+ wmap = {t: ws[i] for i, t in enumerate(ts) if i < len(ws)}
325
+ w = np.array([wmap.get(t, 0.0) for t in universe], dtype=float)
326
+ gross = float(np.sum(np.abs(w)))
327
+ if gross <= 1e-12:
328
+ return None
329
+ return w / gross
330
+ except Exception:
331
+ return None
332
+
333
+ def _risk_query_text(risk: str) -> str:
334
+ if risk == "Low":
335
+ return "conservative low-volatility long-term capital preservation diversified investment grade"
336
+ if risk == "High":
337
+ return "aggressive high risk high growth momentum speculative tech heavy"
338
+ return "balanced moderate risk growth and income diversified core equities and bonds"
339
+
340
+ def _embed_scores(texts: List[str], query: str) -> np.ndarray:
341
+ if _emb is None:
342
+ return np.zeros(len(texts), dtype=float)
343
+ qv = _emb.encode([query], normalize_embeddings=True)[0]
344
+ M = _emb.encode(texts, normalize_embeddings=True)
345
+ sims = (M @ qv).astype(float)
346
+ return sims
347
+
348
+ def make_suggestions(csv_path: str,
349
+ universe: List[str],
350
+ risk: str,
351
+ use_embeddings: bool) -> List[Dict]:
352
+ """
353
+ Return a list of 3 suggestions. Each item:
354
+ {"weights": {ticker: expo}, "er": float, "sigma": float, "beta": float, "row_text": str}
355
+ """
356
+ try:
357
+ df = pd.read_csv(csv_path)
358
+ except Exception:
359
+ return []
 
 
 
 
 
 
360
 
361
+ # Keep only rows that map nicely to current universe
362
+ rows = []
363
+ exps = []
364
+ for _, r in df.iterrows():
365
+ x = _row_to_exposures(r, universe)
366
+ if x is None:
367
+ continue
368
+ rows.append(r)
369
+ exps.append(x)
370
+ if not rows:
371
+ return []
372
+
373
+ exps = np.vstack(exps)
374
+ sigs = np.array([float(r["sigma_p"]) for r in rows])
375
+ ers = np.array([float(r["er_p"]) for r in rows])
376
+
377
+ # Choose a target sigma by risk quantile
378
+ qmap = {"Low": 0.25, "Medium": 0.50, "High": 0.85}
379
+ q = qmap.get(risk, 0.50)
380
+ target_sigma = float(np.quantile(sigs, q=q))
381
+
382
+ # Rank by closeness in sigma to target
383
+ base_idx = np.argsort(np.abs(sigs - target_sigma))
384
+
385
+ # Optional: light re-ranking using embeddings to prefer text that matches risk intent
386
  if use_embeddings:
387
+ texts = [str(rows[i]["profile_text"]) for i in base_idx[:120]]
388
+ sims = _embed_scores(texts, _risk_query_text(risk))
389
+ # Blend: 80% sigma closeness (smaller better) and -20% similarity (larger better)
390
+ closeness = np.abs(sigs[base_idx[:120]] - target_sigma)
391
+ score = 0.8 * (closeness / (closeness.max() + 1e-9)) - 0.2 * sims
392
+ rerank_local = np.argsort(score)
393
+ idx = base_idx[:120][rerank_local]
394
+ else:
395
+ idx = base_idx
396
+
397
+ # Take top 3 diverse by exposure distance
398
+ picks, chosen = [], []
399
+ for i in idx:
400
+ wvec = exps[i]
401
+ # enforce some diversity
402
+ ok = True
403
+ for j in chosen:
404
+ if np.linalg.norm(wvec - exps[j]) < 0.25:
405
+ ok = False
406
+ break
407
+ if not ok:
408
+ continue
409
+ chosen.append(i)
410
+ r = rows[i]
411
+ wmap = {universe[k]: float(wvec[k]) for k in range(len(universe)) if abs(wvec[k]) > 1e-4}
412
+ picks.append({
413
+ "weights": wmap,
414
+ "er": float(r["er_p"]),
415
+ "sigma": float(r["sigma_p"]),
416
+ "beta": float(r["beta_p"]),
417
+ "row_text": str(r["profile_text"])
418
  })
419
+ if len(picks) == 3:
420
+ break
421
+ return picks
422
+
423
+ # -------------- formatting helpers --------------
424
+ def fmt_pct(x: float, dp: int = 2) -> str:
425
+ return f"{x*100:.{dp}f}%"
426
+
427
+ def build_summary_md(lookback, horizon, rf, rf_code, erp, sigma_mkt,
428
+ beta_p, sigma_hist, mu_hist, mu_capm,
429
+ a_sigma, b_sigma, mu_eff_sigma,
430
+ a_mu, b_mu, sigma_eff_mu) -> str:
431
+ lines = []
432
+ lines.append("### Inputs")
433
+ lines.append(f"- Lookback years **{lookback}**")
434
+ lines.append(f"- Horizon years **{horizon}**")
435
+ lines.append(f"- Risk-free **{fmt_pct(rf)}** from **{rf_code}**")
436
+ lines.append(f"- Market ERP **{fmt_pct(erp)}**")
437
+ lines.append(f"- Market σ **{fmt_pct(sigma_mkt)}**")
438
+ lines.append("")
439
+ lines.append("### Your portfolio (CAPM expectations)")
440
+ lines.append(f"- Beta **{beta_p:.2f}**")
441
+ lines.append(f"- σ (historical) **{fmt_pct(sigma_hist)}**")
442
+ lines.append(f"- Expected return (historical) **{fmt_pct(mu_hist)}**")
443
+ lines.append(f"- Expected return (CAPM / SML) **{fmt_pct(mu_capm)}**")
444
+ lines.append("")
445
+ lines.append("### Efficient alternatives on CML")
446
+ lines.append(f"- Same σ as your portfolio → Market weight **{a_sigma:.2f}**, Bills weight **{b_sigma:.2f}**, return **{fmt_pct(mu_eff_sigma)}**")
447
+ lines.append(f"- Same return (CAPM) → Market weight **{a_mu:.2f}**, Bills weight **{b_mu:.2f}**, σ **{fmt_pct(sigma_eff_mu)}**")
448
+ return "\n".join(lines)
449
+
450
+ # -------------- stateful globals on launch --------------
451
+ ensure_data_dir()
452
+ HORIZON_YEARS = 10
453
+ RF_CODE = fred_series_for_horizon(HORIZON_YEARS)
454
+ RF_ANN = fetch_fred_yield_annual(RF_CODE)
455
 
456
+ # -------------- gradio callbacks --------------
 
 
 
457
  def search_tickers_cb(q: str):
458
  hits = yahoo_search(q)
459
+ if not hits:
460
+ return "No matches", []
461
  opts = [f"{h['symbol']} | {h['name']} | {h['exchange']}" for h in hits]
462
  return "Select a symbol and click Add", opts
463
 
464
  def add_symbol(selection: str, table: pd.DataFrame):
465
+ if not selection:
466
+ return table, "Pick a row from Matches first"
467
  symbol = selection.split("|")[0].strip().upper()
468
  current = [] if table is None or len(table) == 0 else [str(x).upper() for x in table["ticker"].tolist() if str(x) != "nan"]
469
  tickers = current if symbol in current else current + [symbol]
470
+
471
+ # validate against yfinance (with market ticker alongside to force download structure)
472
  val = validate_tickers(tickers, years=DEFAULT_LOOKBACK_YEARS)
473
  tickers = [t for t in tickers if t in val]
474
  amt_map = {}
 
480
  new_table = pd.DataFrame({"ticker": tickers, "amount_usd": [amt_map.get(t, 0.0) for t in tickers]})
481
  msg = f"Added {symbol}" if symbol in tickers else f"{symbol} not valid"
482
  if len(new_table) > MAX_TICKERS:
483
+ new_table = new_table.iloc[:MAX_TICKERS]
484
+ msg = f"Reached max of {MAX_TICKERS}"
485
  return new_table, msg
486
 
487
  def lock_ticker_column(tb: pd.DataFrame):
 
494
  amounts = amounts[:len(tickers)] + [0.0] * max(0, len(tickers) - len(amounts))
495
  return pd.DataFrame({"ticker": tickers, "amount_usd": amounts})
496
 
 
 
 
 
497
  def set_horizon(years: float):
498
  y = max(1.0, min(100.0, float(years)))
499
+ code = fred_series_for_horizon(y)
500
+ rf = fetch_fred_yield_annual(code)
501
  global HORIZON_YEARS, RF_CODE, RF_ANN
502
+ HORIZON_YEARS = int(round(y))
503
+ RF_CODE = code
504
+ RF_ANN = rf
505
+ return f"Risk free series {code}. Latest annual rate {rf:.2%}. Using this for CAPM."
506
+
507
+ def _build_dataset_path() -> str:
508
+ return os.path.join(DATA_DIR, f"investor_profiles_{hex(random.getrandbits(32))[2:]}.csv")
509
+
510
+ def compute(
511
+ years_lookback: int,
512
+ table: pd.DataFrame,
513
+ risk_choice: str,
514
+ use_embeddings: bool
515
+ ):
516
+ # --- sanitize input table ---
517
+ if table is None or len(table) == 0:
518
+ return None, "Add at least one ticker", "Universe empty", empty_positions_df(), gr.update(choices=[], value=None), empty_suggest_df(), None, {}
519
 
 
 
 
 
520
  df = table.dropna()
521
  df["ticker"] = df["ticker"].astype(str).str.upper().str.strip()
522
  df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce").fillna(0.0)
 
523
  symbols = [t for t in df["ticker"].tolist() if t]
524
  if len(symbols) == 0:
525
+ return None, "Add at least one ticker", "Universe empty", empty_positions_df(), gr.update(choices=[], value=None), empty_suggest_df(), None, {}
 
 
526
 
527
+ symbols = validate_tickers(symbols, years_lookback)
528
  if len(symbols) == 0:
529
+ return None, "Could not validate any tickers", "Universe invalid", empty_positions_df(), gr.update(choices=[], value=None), empty_suggest_df(), None, {}
 
 
530
 
531
+ universe = list(sorted(set([s for s in symbols if s != MARKET_TICKER] + [MARKET_TICKER])))[:MAX_TICKERS]
532
 
533
  df = df[df["ticker"].isin(symbols)].copy()
534
  amounts = {r["ticker"]: float(r["amount_usd"]) for _, r in df.iterrows()}
535
+ gross = sum(abs(v) for v in amounts.values())
536
+ if gross <= 1e-12:
537
+ return None, "All amounts are zero", "Universe ok", empty_positions_df(), gr.update(choices=[], value=None), empty_suggest_df(), None, {}
538
+
539
+ # --- CAPM ingredients ---
540
+ rf_ann = RF_ANN
541
+ moms = estimate_all_moments_aligned(symbols, years_lookback, rf_ann)
542
+ betas, covA, erp_ann, sigma_mkt = moms["betas"], moms["cov_ann"], moms["erp_ann"], moms["sigma_m_ann"]
543
+
544
+ # portfolio weights/exposures
545
+ weights = {k: v / gross for k, v in amounts.items()}
546
+ beta_p, mu_capm, sigma_p = portfolio_stats(weights, covA, betas, rf_ann, erp_ann)
547
+
548
+ # historical mean (for info only)
549
+ try:
550
+ R = get_aligned_monthly_returns(symbols, years_lookback)
551
+ mu_hist = float(annualize_mean(R[symbols].mean().dot(np.array([weights[s] for s in symbols]))))
552
+ sigma_hist = sigma_p # same sigma as built from covA
553
+ except Exception:
554
+ mu_hist = mu_capm
555
+ sigma_hist = sigma_p
556
+
557
+ # efficient points on CML (use CAPM target)
558
+ a_sigma, b_sigma, mu_eff_sigma = efficient_same_sigma(sigma_p, rf_ann, erp_ann, sigma_mkt)
559
+ a_mu, b_mu, sigma_eff_mu = efficient_same_return(mu_capm, rf_ann, erp_ann, sigma_mkt)
560
+
561
+ # --- Build dataset once for this run (universe-specific) ---
562
+ ds_path = _build_dataset_path()
563
+ synth_df = build_synthetic_dataset(
564
+ universe=[u for u in universe if u != MARKET_TICKER],
565
+ years=years_lookback,
566
+ rf_ann=rf_ann,
567
+ erp_ann=erp_ann,
568
+ covA=covA,
569
+ betas=betas
570
+ )
571
+ synth_df.to_csv(ds_path, index=False)
572
+
573
+ # --- Suggestions (3 picks) ---
574
+ picks = make_suggestions(ds_path, [u for u in universe if u != MARKET_TICKER], risk_choice, use_embeddings)
575
+ if not picks:
576
+ pick_choices = []
577
+ sugg_table = empty_suggest_df()
578
+ sugg_sigma = None
579
+ sugg_mu = None
580
+ else:
581
+ pick_choices = [f"Pick #{i+1}" for i in range(len(picks))]
582
+ # default selection = first pick
583
+ first = picks[0]
584
+ sugg_sigma = float(first["sigma"])
585
+ sugg_mu = float(first["er"])
586
+ sugg_table = _pick_table(first, amounts)
587
+
588
+ # --- Plot with CAPM portfolio and suggestion point (if any) ---
589
+ img = plot_cml(
590
+ rf_ann, erp_ann, sigma_mkt,
591
+ pt_sigma=sigma_p, pt_mu=mu_capm,
592
+ same_sigma_sigma=sigma_p, same_sigma_mu=mu_eff_sigma,
593
+ same_mu_sigma=sigma_eff_mu, same_mu_mu=mu_capm,
594
+ sugg_sigma=sugg_sigma, sugg_mu=sugg_mu
595
  )
596
 
597
+ # --- Summary text ---
598
+ summary = build_summary_md(
599
+ years_lookback, HORIZON_YEARS, rf_ann, RF_CODE, erp_ann, sigma_mkt,
600
+ beta_p, sigma_hist, mu_hist, mu_capm,
601
+ a_sigma, b_sigma, mu_eff_sigma,
602
+ a_mu, b_mu, sigma_eff_mu
603
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
604
 
605
  # positions table
606
  rows = []
607
  for t in symbols:
608
+ beta_val = 1.0 if t == MARKET_TICKER else betas.get(t, np.nan)
609
  rows.append({
610
  "ticker": t,
611
+ "amount_usd": amounts.get(t, 0.0),
612
+ "weight_exposure": weights.get(t, 0.0),
613
+ "beta": beta_val,
614
  })
615
  pos_table = pd.DataFrame(rows, columns=POS_COLS)
616
 
617
+ uni_msg = f"Universe set to {', '.join(universe)}"
618
+ # Return suggestions state so the picker can swap views
619
+ suggestions_state = {"picks": picks, "amounts": amounts, "rf": rf_ann, "erp": erp_ann, "sigma_mkt": sigma_mkt, "mu_capm": mu_capm, "sigma_p": sigma_p}
620
+ return img, summary, uni_msg, pos_table, gr.update(choices=pick_choices, value=(pick_choices[0] if pick_choices else None), interactive=bool(pick_choices)), sugg_table, ds_path, suggestions_state
621
+
622
+ def _pick_table(pick: Dict, amounts_map: Dict[str, float]) -> pd.DataFrame:
623
+ gross = float(sum(abs(v) for v in amounts_map.values()))
624
+ wmap = pick["weights"]
625
+ # normalize to exposures sum of abs = 1 for display
626
+ gross_w = sum(abs(v) for v in wmap.values())
627
+ if gross_w <= 1e-12:
628
+ return empty_suggest_df()
629
+ w_norm = {k: v / gross_w for k, v in wmap.items()}
630
+ rows = []
631
+ for t, w in sorted(w_norm.items(), key=lambda kv: -abs(kv[1])):
632
+ rows.append({
633
+ "ticker": t,
634
+ "weight_%": 100.0 * float(w),
635
+ "amount_$": float(w) * gross
636
+ })
637
+ df = pd.DataFrame(rows, columns=SUG_TABLE_COLS)
638
+ return df
639
+
640
+ def on_select_pick(choice: Optional[str], suggestions_state: Dict):
641
+ if not choice or not suggestions_state or not suggestions_state.get("picks"):
642
+ return empty_suggest_df(), gr.update(value=None)
643
+ idx = int(choice.split("#")[1]) - 1
644
+ idx = max(0, min(idx, len(suggestions_state["picks"]) - 1))
645
+ pick = suggestions_state["picks"][idx]
646
+ table = _pick_table(pick, suggestions_state["amounts"])
647
+
648
+ # Update the plot with the chosen suggestion dot
649
+ img = plot_cml(
650
+ suggestions_state["rf"],
651
+ suggestions_state["erp"],
652
+ suggestions_state["sigma_mkt"],
653
+ pt_sigma=suggestions_state["sigma_p"],
654
+ pt_mu=suggestions_state["mu_capm"],
655
+ same_sigma_sigma=suggestions_state["sigma_p"],
656
+ same_sigma_mu=efficient_same_sigma(suggestions_state["sigma_p"], suggestions_state["rf"], suggestions_state["erp"], suggestions_state["sigma_mkt"])[2],
657
+ same_mu_sigma=efficient_same_return(suggestions_state["mu_capm"], suggestions_state["rf"], suggestions_state["erp"], suggestions_state["sigma_mkt"])[2],
658
+ same_mu_mu=suggestions_state["mu_capm"],
659
+ sugg_sigma=float(pick["sigma"]),
660
+ sugg_mu=float(pick["er"]),
661
+ )
662
+ return table, img
663
+
664
+ # -------------- UI --------------
665
  with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
666
  with gr.Accordion("About (assignment section 1)", open=False):
667
  gr.Markdown(
668
+ "**Modality:** Text\n\n"
669
+ "**Model:** FinLang/finance-embeddings-investopedia (optional, for mild re-ranking of dataset suggestions).\n\n"
670
+ "**Use case:** User enters tickers and dollar amounts; the app computes CAPM expectations and shows the "
671
+ "Capital Market Line. From a synthetic dataset (1,000 portfolios generated over the user’s universe), "
672
+ "the system returns 3 similar portfolios (Low/Medium/High risk picks). The user can flip between the "
673
+ "suggested picks and see holdings in % and $ plus where the suggestion sits on the CML.\n"
674
  )
675
 
676
  gr.Markdown(
677
  "## Efficient Portfolio Advisor\n"
678
  "Search symbols, enter dollar amounts, set your horizon. Prices from Yahoo Finance. Risk-free from FRED. "
679
+ "Low/Medium/High suggestions are chosen only from a 1,000-row dataset generated from your current universe, "
680
+ "optionally refined with finance embeddings."
681
  )
682
 
683
  with gr.Row():
684
  with gr.Column(scale=1):
685
  q = gr.Textbox(label="Search symbol")
686
+ search_note = gr.Markdown()
687
  matches = gr.Dropdown(choices=[], label="Matches")
688
  with gr.Row():
689
  search_btn = gr.Button("Search")
 
693
  table = gr.Dataframe(
694
  headers=["ticker", "amount_usd"],
695
  datatype=["str", "number"],
696
+ row_count=0,
697
+ col_count=(2, "fixed")
698
  )
699
 
700
  horizon = gr.Number(label="Horizon in years (1–100)", value=HORIZON_YEARS, precision=0)
 
702
 
703
  gr.Markdown("### Suggestions")
704
  risk = gr.Radio(["Low", "Medium", "High"], value="Medium", label="Risk tolerance")
705
+ use_emb = gr.Checkbox(label="Use finance embeddings to refine picks", value=True)
706
+
707
+ run_btn = gr.Button("Compute (build dataset & suggest)", variant="primary")
708
 
709
  with gr.Column(scale=1):
710
  plot = gr.Image(label="Capital Market Line (CML)", type="pil")
711
+ summary = gr.Markdown(label="Inputs & CAPM expectations")
712
  universe_msg = gr.Textbox(label="Universe status", interactive=False)
713
+
714
  positions = gr.Dataframe(
715
  label="Computed positions",
716
  headers=POS_COLS,
717
  datatype=["str", "number", "number", "number"],
718
  col_count=(len(POS_COLS), "fixed"),
719
+ value=empty_positions_df(),
720
  interactive=False
721
  )
722
 
723
+ with gr.Row():
724
+ with gr.Column(scale=1):
725
+ pick_select = gr.Radio(choices=[], label="Suggested pick (flip between #1 / #2 / #3)", interactive=False)
726
+ with gr.Column(scale=1):
727
+ sugg_table = gr.Dataframe(
728
+ label="Suggestion holdings — % and $",
729
+ headers=SUG_TABLE_COLS,
730
+ datatype=["str", "number", "number"],
731
+ col_count=(len(SUG_TABLE_COLS), "fixed"),
732
+ value=empty_suggest_df(),
733
  interactive=False
734
  )
735
+ dl = gr.File(label="Generated dataset CSV", value=None, visible=True)
736
+
737
+ # hidden state for suggestions
738
+ suggestions_state = gr.State({})
739
 
740
+ # wire events
741
+ def do_search(query):
742
+ note, options = search_tickers_cb(query)
743
+ return note, gr.update(choices=options)
744
 
 
 
745
  search_btn.click(fn=do_search, inputs=q, outputs=[search_note, matches])
746
  add_btn.click(fn=add_symbol, inputs=[matches, table], outputs=[table, search_note])
747
  table.change(fn=lock_ticker_column, inputs=table, outputs=table)
 
749
 
750
  run_btn.click(
751
  fn=compute,
752
+ inputs=[lookback, table, risk, use_emb],
753
+ outputs=[plot, summary, universe_msg, positions, pick_select, sugg_table, dl, suggestions_state]
754
  )
755
 
756
+ pick_select.change(
757
+ fn=on_select_pick,
758
+ inputs=[pick_select, suggestions_state],
759
+ outputs=[sugg_table, plot]
760
  )
761
 
762
  if __name__ == "__main__":