Tulitula commited on
Commit
fb8592f
·
verified ·
1 Parent(s): d020540

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +259 -308
app.py CHANGED
@@ -1,7 +1,6 @@
1
  # app.py
2
- # Efficient Portfolio Advisor — with dataset-based Low/Medium/High suggestions
3
- # Modality: Text. Models: yfinance (prices), FRED (risk-free), simple CAPM math,
4
- # optional reranking with sentence-transformers "FinLang/finance-embeddings-investopedia".
5
 
6
  import os
7
  import io
@@ -16,50 +15,31 @@ import numpy as np
16
  import pandas as pd
17
  import matplotlib.pyplot as plt
18
  from PIL import Image
19
- import requests
20
  import gradio as gr
 
21
  import yfinance as yf
22
 
23
- # Optional (lazy) import for embeddings
24
- _ST_MODEL = None
25
-
26
- # ---------- Config ----------
27
- DATA_DIR = "data"
28
- os.makedirs(DATA_DIR, exist_ok=True)
29
 
 
 
30
  MARKET_TICKER = "VOO"
31
  MAX_TICKERS = 30
32
  DEFAULT_LOOKBACK_YEARS = 10
33
  DATASET_ROWS = 1000
34
 
35
- # FRED mappings by horizon
36
  FRED_MAP = [
37
- (1, "DGS1"),
38
- (2, "DGS2"),
39
- (3, "DGS3"),
40
- (5, "DGS5"),
41
- (7, "DGS7"),
42
- (10, "DGS10"),
43
- (20, "DGS20"),
44
- (30, "DGS30"),
45
- (100, "DGS30"),
46
  ]
47
 
48
  POS_COLS = ["ticker", "amount_usd", "weight_exposure", "beta"]
49
  SUG_COLS_HOLD = ["pick", "ticker", "weight_%", "amount_$"]
50
 
51
- # ---------- Small helpers ----------
52
  def fmt_pct(x: float, dec: int = 2) -> str:
53
- try:
54
- return f"{x*100:.{dec}f}%"
55
- except Exception:
56
- return "—"
57
-
58
- def fmt_usd(x: float) -> str:
59
- try:
60
- return f"${x:,.2f}"
61
- except Exception:
62
- return "—"
63
 
64
  def ensure_dir(p: str):
65
  os.makedirs(os.path.dirname(p), exist_ok=True)
@@ -67,60 +47,45 @@ def ensure_dir(p: str):
67
  def fred_series_for_horizon(years: float) -> str:
68
  y = max(1.0, min(100.0, float(years)))
69
  for cutoff, code in FRED_MAP:
70
- if y <= cutoff:
71
- return code
72
  return "DGS30"
73
 
74
  def fetch_fred_yield_annual(code: str) -> float:
75
  url = f"https://fred.stlouisfed.org/graph/fredgraph.csv?id={code}"
76
  try:
77
- r = requests.get(url, timeout=10)
78
- r.raise_for_status()
79
  df = pd.read_csv(io.StringIO(r.text))
80
  s = pd.to_numeric(df.iloc[:, 1], errors="coerce").dropna()
81
  return float(s.iloc[-1] / 100.0) if len(s) else 0.03
82
- except Exception:
83
- return 0.03
84
 
85
- # ---------- Prices & returns (fix for 'Close' KeyError) ----------
86
  def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
87
  start = pd.Timestamp.today(tz="UTC") - pd.DateOffset(years=years, days=7)
88
  end = pd.Timestamp.today(tz="UTC")
89
-
90
  raw = yf.download(
91
  list(dict.fromkeys(tickers)),
92
- start=start.date(),
93
- end=end.date(),
94
- interval="1mo",
95
- auto_adjust=False, # prefer 'Adj Close' if present
96
- progress=False,
97
- group_by="ticker",
98
- threads=False,
99
  )
100
  if raw is None or len(raw) == 0:
101
  return pd.DataFrame()
102
 
103
- # MultiIndex (ticker, field) vs single-index
104
  if isinstance(raw.columns, pd.MultiIndex):
105
  price = None
106
  for field in ("Adj Close", "Close"):
107
  if field in raw.columns.get_level_values(-1):
108
- price = raw.xs(field, axis=1, level=-1, drop_level=True)
109
- break
110
  if price is None:
111
  price = raw.copy()
112
  price.columns = [c[0] if isinstance(c, tuple) else c for c in price.columns]
113
  else:
114
- if "Adj Close" in raw.columns:
115
- price = raw["Adj Close"]
116
- elif "Close" in raw.columns:
117
- price = raw["Close"]
118
- else:
119
- price = raw
120
-
121
- if isinstance(price, pd.Series):
122
- price = price.to_frame()
123
 
 
124
  price = price.dropna(how="all").fillna(method="ffill")
125
  price = price.loc[:, ~pd.Index(price.columns).duplicated()]
126
  return price
@@ -128,23 +93,18 @@ def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
128
  def monthly_returns(prices: pd.DataFrame) -> pd.DataFrame:
129
  return prices.pct_change().dropna()
130
 
131
- def annualize_mean(m):
132
- return np.asarray(m, dtype=float) * 12.0
133
-
134
- def annualize_sigma(s):
135
- return np.asarray(s, dtype=float) * math.sqrt(12.0)
136
 
137
- # ---------- Search & validation ----------
138
  def yahoo_search(query: str):
139
- if not query or not query.strip():
140
- return []
141
  url = "https://query1.finance.yahoo.com/v1/finance/search"
142
  params = {"q": query.strip(), "quotesCount": 10, "newsCount": 0}
143
  headers = {"User-Agent": "Mozilla/5.0"}
144
  try:
145
  r = requests.get(url, params=params, headers=headers, timeout=10)
146
- r.raise_for_status()
147
- data = r.json()
148
  out = []
149
  for q in data.get("quotes", []):
150
  sym = q.get("symbol")
@@ -155,55 +115,56 @@ def yahoo_search(query: str):
155
  if not out:
156
  out = [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": "—"}]
157
  return out[:10]
158
- except Exception:
159
  return [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": "—"}]
160
 
161
  def validate_tickers(symbols: List[str], years: int) -> List[str]:
162
- # include market to keep alignment, but validate only user symbols
163
  base = list(dict.fromkeys([s.strip().upper() for s in symbols if s.strip()]))[:MAX_TICKERS]
164
  px = fetch_prices_monthly(base + [MARKET_TICKER], years)
165
  ok = [s for s in base if s in px.columns]
166
  return ok
167
 
168
- # ---------- Aligned CAPM moments ----------
169
  def get_aligned_monthly_returns(symbols: List[str], years: int) -> pd.DataFrame:
170
- uniq = [c for c in dict.fromkeys(symbols) if c != MARKET_TICKER]
171
- tickers = uniq + [MARKET_TICKER]
172
- px = fetch_prices_monthly(tickers, years)
173
  rets = monthly_returns(px)
174
- cols = [c for c in uniq if c in rets.columns] + ([MARKET_TICKER] if MARKET_TICKER in rets.columns else [])
175
  R = rets[cols].dropna(how="any")
176
  return R.loc[:, ~R.columns.duplicated()]
177
 
178
  def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
179
- R = get_aligned_monthly_returns(symbols, years)
180
  if MARKET_TICKER not in R.columns or R.shape[0] < 3:
181
  raise ValueError("Not enough aligned data to estimate moments.")
182
  rf_m = rf_ann / 12.0
183
 
184
- m = R[MARKET_TICKER]
185
- if isinstance(m, pd.DataFrame):
186
- m = m.iloc[:, 0].squeeze()
187
-
188
- mu_m_ann = float(annualize_mean(m.mean()))
189
- sigma_m_ann = float(annualize_sigma(m.std(ddof=1)))
190
  erp_ann = float(mu_m_ann - rf_ann)
191
 
192
- ex_m = m - rf_m
193
- var_m = float(np.var(ex_m.values, ddof=1))
194
- var_m = max(var_m, 1e-6)
195
-
196
  betas: Dict[str, float] = {}
197
- for s in [c for c in R.columns if c != MARKET_TICKER]:
198
  ex_s = R[s] - rf_m
199
  betas[s] = float(np.cov(ex_s.values, ex_m.values, ddof=1)[0, 1] / var_m)
200
  betas[MARKET_TICKER] = 1.0
201
 
202
- asset_cols = [c for c in R.columns if c != MARKET_TICKER]
203
- cov_m = np.cov(R[asset_cols].values.T, ddof=1) if asset_cols else np.zeros((0, 0))
204
- covA = pd.DataFrame(cov_m * 12.0, index=asset_cols, columns=asset_cols)
205
 
206
- return {"betas": betas, "cov_ann": covA, "erp_ann": erp_ann, "sigma_m_ann": sigma_m_ann}
 
 
 
 
 
 
207
 
208
  def capm_er(beta: float, rf_ann: float, erp_ann: float) -> float:
209
  return float(rf_ann + beta * erp_ann)
@@ -214,99 +175,80 @@ def portfolio_stats(weights: Dict[str, float],
214
  rf_ann: float,
215
  erp_ann: float) -> Tuple[float, float, float]:
216
  tickers = list(weights.keys())
217
- if len(tickers) == 0:
218
- return 0.0, 0.0, 0.0
219
  w = np.array([weights[t] for t in tickers], dtype=float)
220
- gross = float(np.sum(np.abs(w)))
221
- if gross <= 1e-12:
222
- return 0.0, 0.0, 0.0
223
- w_expo = w / gross
224
  beta_p = float(np.dot([betas.get(t, 0.0) for t in tickers], w_expo))
225
- er_p = capm_er(beta_p, rf_ann, erp_ann)
 
226
  cov = cov_ann.reindex(index=tickers, columns=tickers).fillna(0.0).to_numpy()
227
  sigma_p = math.sqrt(float(max(w_expo.T @ cov @ w_expo, 0.0)))
228
  return beta_p, er_p, sigma_p
229
 
230
- # ---------- CML helpers (plot in %) ----------
 
 
 
 
 
 
 
231
  def efficient_same_sigma(sigma_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
232
- if sigma_mkt <= 1e-12:
233
- return 0.0, 1.0, rf_ann
234
  a = sigma_target / sigma_mkt
235
  return a, 1.0 - a, rf_ann + a * erp_ann
236
 
237
  def efficient_same_return(mu_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
238
- if abs(erp_ann) <= 1e-12:
239
- return 0.0, 1.0, rf_ann
240
  a = (mu_target - rf_ann) / erp_ann
241
  return a, 1.0 - a, abs(a) * sigma_mkt
242
 
243
- def plot_cml_percent(
244
- rf_ann, erp_ann, sigma_mkt,
245
- pt_sigma, pt_mu,
246
- same_sigma_sigma, same_sigma_mu,
247
- same_mu_sigma, same_mu_mu,
248
- suggestion_sigma=None, suggestion_mu=None
249
- ) -> Image.Image:
250
- fig = plt.figure(figsize=(6, 4), dpi=120)
251
-
252
- xmax = max(
253
- 0.3,
254
- sigma_mkt * 2.0,
255
- pt_sigma * 1.4,
256
- same_mu_sigma * 1.4,
257
- same_sigma_sigma * 1.4,
258
- (suggestion_sigma or 0.0) * 1.4,
259
- )
260
  xs = np.linspace(0, xmax, 160)
261
- slope = erp_ann / max(sigma_mkt, 1e-12)
262
  cml = rf_ann + slope * xs
263
- plt.plot(xs * 100, cml * 100, label="CML via Market")
264
-
265
- # Points
266
- plt.scatter([0.0], [rf_ann * 100], label="Risk-free (FRED)")
267
- plt.scatter([sigma_mkt * 100], [(rf_ann + erp_ann) * 100], label="Market VOO")
268
- plt.scatter([pt_sigma * 100], [pt_mu * 100], label="Your portfolio")
269
- plt.scatter([same_sigma_sigma * 100], [same_sigma_mu * 100], label="Efficient same sigma")
270
- plt.scatter([same_mu_sigma * 100], [same_mu_mu * 100], label="Efficient same return")
271
- if suggestion_sigma is not None and suggestion_mu is not None:
272
- plt.scatter([suggestion_sigma * 100], [suggestion_mu * 100], label="Suggestion")
273
-
274
- # simple dotted guides
275
- plt.plot([pt_sigma * 100, same_sigma_sigma * 100], [pt_mu * 100, same_sigma_mu * 100], linestyle="--", lw=1, alpha=0.7, color="gray")
276
- plt.plot([pt_sigma * 100, same_mu_sigma * 100], [pt_mu * 100, same_mu_mu * 100], linestyle="--", lw=1, alpha=0.7, color="gray")
277
-
278
- plt.xlabel("σ (annualized, %)")
279
- plt.ylabel("Expected return (annual, %)")
280
- plt.legend(loc="best", fontsize=8)
281
- plt.tight_layout()
282
-
283
- buf = io.BytesIO()
284
- plt.savefig(buf, format="png")
285
- plt.close(fig)
286
- buf.seek(0)
287
  return Image.open(buf)
288
 
289
- # ---------- Synthetic dataset (built only from current universe) ----------
290
  def _row_exposures(row: pd.Series, universe: List[str]) -> Optional[np.ndarray]:
291
  try:
292
  ts = [t.strip() for t in str(row["tickers"]).split(",")]
293
  ws = [float(x) for x in str(row["weights"]).split(",")]
294
  wmap = {t: ws[i] for i, t in enumerate(ts) if i < len(ws)}
295
  w = np.array([wmap.get(t, 0.0) for t in universe], dtype=float)
296
- gross = float(np.sum(np.abs(w)))
297
- if gross <= 1e-12:
298
- return None
299
  return w / gross
300
- except Exception:
301
- return None
302
 
303
  def build_synthetic_dataset(universe: List[str], years: int, rf_ann: float, erp_ann: float, n_rows: int = DATASET_ROWS) -> pd.DataFrame:
304
- # require MARKET_TICKER present for moments; weights exclude it unless random pick includes
305
  moms = estimate_all_moments_aligned(universe, years, rf_ann)
306
  covA, betas = moms["cov_ann"], moms["betas"]
307
 
308
- rng = np.random.default_rng(12345)
309
- rows = []
310
  for i in range(n_rows):
311
  k = int(rng.integers(low=min(2, len(universe)), high=min(8, len(universe)) + 1))
312
  picks = list(rng.choice(universe, size=k, replace=False))
@@ -314,49 +256,34 @@ def build_synthetic_dataset(universe: List[str], years: int, rf_ann: float, erp_
314
  raw = rng.dirichlet(np.ones(k))
315
  gross = 1.0 + float(rng.gamma(2.0, 0.5))
316
  w = gross * signs * raw
317
- # portfolio stats
318
  beta_p, er_p, sigma_p = portfolio_stats({picks[j]: w[j] for j in range(k)}, covA, betas, rf_ann, erp_ann)
319
  rows.append({
320
  "id": i,
321
  "tickers": ",".join(picks),
322
  "weights": ",".join(f"{x:.6f}" for x in w),
323
- "er_p": er_p,
324
- "sigma_p": sigma_p,
325
- "beta_p": beta_p
326
  })
327
  return pd.DataFrame(rows)
328
 
329
  def dataset_path_for_universe(universe: List[str]) -> str:
330
  key = ",".join(sorted(universe))
331
  h = abs(hash(key)) % (10**8)
332
- p = os.path.join(DATA_DIR, f"investor_profiles_{h}.csv")
333
- return p
334
 
335
- # ---------- Suggestion logic (Low / Medium / High) ----------
336
  def _risk_targets(sigmas: np.ndarray) -> Dict[str, float]:
337
- # choose targets by quantiles of dataset sigma
338
- return {
339
- "Low": float(np.quantile(sigmas, 0.15)),
340
- "Medium": float(np.quantile(sigmas, 0.50)),
341
- "High": float(np.quantile(sigmas, 0.85)),
342
- }
343
 
344
  def _describe_row_for_embeddings(row: pd.Series, universe: List[str]) -> str:
345
- # text description for semantic reranking
346
  parts = [f"sigma {row['sigma_p']:.4f}", f"beta {row['beta_p']:.2f}", f"expected return {row['er_p']:.4f}"]
347
  ex = _row_exposures(row, universe)
348
  if ex is not None:
349
  top = sorted([(universe[i], float(abs(ex[i]))) for i in range(len(universe))], key=lambda kv: -kv[1])[:4]
350
- parts.append("focus on " + ", ".join([f"{t}:{w:.2f}" for t, w in top]))
351
  return " ".join(parts)
352
 
353
- def _get_prompt(risk_level: str) -> str:
354
- if risk_level == "Low":
355
- return "low risk, stable, conservative diversified portfolio"
356
- if risk_level == "High":
357
- return "high risk, growth oriented, aggressive portfolio"
358
- return "balanced moderate risk diversified portfolio"
359
-
360
  def _maybe_load_st_model():
361
  global _ST_MODEL
362
  if _ST_MODEL is None:
@@ -364,83 +291,77 @@ def _maybe_load_st_model():
364
  _ST_MODEL = SentenceTransformer("FinLang/finance-embeddings-investopedia")
365
  return _ST_MODEL
366
 
367
- def suggest_from_dataset(csv_path: str,
368
- universe: List[str],
369
- total_amount: float,
370
- risk_level: str,
371
- use_embeddings: bool = False):
372
- try:
373
- df = pd.read_csv(csv_path)
374
- except Exception:
375
- return pd.DataFrame(columns=SUG_COLS_HOLD), None
376
-
377
- if df.empty:
378
- return pd.DataFrame(columns=SUG_COLS_HOLD), None
 
 
379
 
380
  sigmas = df["sigma_p"].to_numpy(dtype=float)
381
- targets = _risk_targets(sigmas)
382
- target_sigma = targets.get(risk_level, targets["Medium"])
383
-
384
- # distance to target sigma
385
- df = df.copy()
386
- df["dist"] = (df["sigma_p"] - target_sigma).abs()
387
 
388
- # Take a reasonable candidate pool
389
  cand = df.nsmallest(100, "dist").reset_index(drop=True)
390
 
391
- # Optional semantic rerank
392
  if use_embeddings:
393
  model = _maybe_load_st_model()
394
- prompt = _get_prompt(risk_level)
395
- texts = [prompt] + [ _describe_row_for_embeddings(r, universe) for _, r in cand.iterrows() ]
396
  embs = model.encode(texts)
397
- S = model.similarity(embs[0:1], embs[1:]).flatten() # cosine similarity
398
  cand = cand.assign(sim=S).sort_values("sim", ascending=False).head(50).reset_index(drop=True)
399
 
400
- # Now pick the top 3 by a combined score (distance, then ER desc)
401
- cand["score"] = cand["dist"] - 0.2 * cand["er_p"] # small bias toward higher ER
402
  picks = cand.nsmallest(3, "score").reset_index(drop=True)
403
 
404
- # Build a simple holdings table: percent and dollars
405
- hold_rows = []
406
- first_pick_mu = None
407
- first_pick_sigma = None
408
  for i, row in picks.iterrows():
409
  expo = _row_exposures(row, universe)
410
- if expo is None:
411
- continue
412
- if first_pick_mu is None:
413
- first_pick_mu = float(row["er_p"])
414
- first_pick_sigma = float(row["sigma_p"])
415
  wmap = {universe[j]: float(expo[j]) for j in range(len(universe)) if abs(float(expo[j])) > 1e-4}
416
- for t, w in sorted(wmap.items(), key=lambda kv: -abs(kv[1]))[:12]:
417
- hold_rows.append({
418
- "pick": i + 1,
419
- "ticker": t,
420
- "weight_%": round(w * 100.0, 2),
421
- "amount_$": round(w * total_amount, 2)
422
- })
423
-
424
- hold_df = pd.DataFrame(hold_rows, columns=SUG_COLS_HOLD)
425
- return hold_df, (first_pick_mu, first_pick_sigma)
426
-
427
- # ---------- UI callbacks ----------
 
 
 
 
 
 
 
 
 
428
  def search_tickers_cb(q: str):
429
  hits = yahoo_search(q)
430
- if not hits:
431
- return "No matches", []
432
  opts = [f"{h['symbol']} | {h['name']} | {h['exchange']}" for h in hits]
433
  return "Select a symbol and click Add", opts
434
 
435
  def add_symbol(selection: str, table: pd.DataFrame):
436
- if not selection:
437
- return table, "Pick a row from Matches first."
438
  symbol = selection.split("|")[0].strip().upper()
439
  current = [] if table is None or len(table) == 0 else [str(x).upper() for x in table["ticker"].tolist() if str(x) != "nan"]
440
  tickers = current if symbol in current else current + [symbol]
441
  val = validate_tickers(tickers, years=DEFAULT_LOOKBACK_YEARS)
442
  tickers = [t for t in tickers if t in val]
443
- # preserve amounts
444
  amt_map = {}
445
  if table is not None and len(table) > 0:
446
  for _, r in table.iterrows():
@@ -450,8 +371,7 @@ def add_symbol(selection: str, table: pd.DataFrame):
450
  new_table = pd.DataFrame({"ticker": tickers, "amount_usd": [amt_map.get(t, 0.0) for t in tickers]})
451
  msg = f"Added {symbol}" if symbol in tickers else f"{symbol} not valid"
452
  if len(new_table) > MAX_TICKERS:
453
- new_table = new_table.iloc[:MAX_TICKERS]
454
- msg = f"Reached max of {MAX_TICKERS}"
455
  return new_table, msg
456
 
457
  def lock_ticker_column(tb: pd.DataFrame):
@@ -464,138 +384,163 @@ def lock_ticker_column(tb: pd.DataFrame):
464
  amounts = amounts[:len(tickers)] + [0.0] * max(0, len(tickers) - len(amounts))
465
  return pd.DataFrame({"ticker": tickers, "amount_usd": amounts})
466
 
467
- # Global horizon & rf on change (persisted during session)
468
  HORIZON_YEARS = 10
469
  RF_CODE = fred_series_for_horizon(HORIZON_YEARS)
470
  RF_ANN = fetch_fred_yield_annual(RF_CODE)
471
 
472
  def set_horizon(years: float):
473
  y = max(1.0, min(100.0, float(years)))
474
- code = fred_series_for_horizon(y)
475
- rf = fetch_fred_yield_annual(code)
476
  global HORIZON_YEARS, RF_CODE, RF_ANN
477
- HORIZON_YEARS = y
478
- RF_CODE = code
479
- RF_ANN = rf
480
  return f"Risk-free series {code}. Latest annual rate {fmt_pct(rf)}. Horizon set to {int(round(y))} years."
481
 
482
  def compute(lookback_years: int,
483
  table: pd.DataFrame,
484
  risk_level: str,
485
  use_embeddings: bool):
486
- # ---- read table
487
  df = table.dropna()
488
  df["ticker"] = df["ticker"].astype(str).str.upper().str.strip()
489
  df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce").fillna(0.0)
490
 
491
  symbols = [t for t in df["ticker"].tolist() if t]
492
  if len(symbols) == 0:
493
- return None, "Add at least one ticker.", "—", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=SUG_COLS_HOLD), None
 
 
494
 
495
  symbols = validate_tickers(symbols, lookback_years)
496
  if len(symbols) == 0:
497
- return None, "Could not validate any tickers.", "—", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=SUG_COLS_HOLD), None
 
 
498
 
499
- universe = list(sorted(set([s for s in symbols if s != MARKET_TICKER] + [MARKET_TICKER])))[:MAX_TICKERS]
500
 
501
  df = df[df["ticker"].isin(symbols)].copy()
502
  amounts = {r["ticker"]: float(r["amount_usd"]) for _, r in df.iterrows()}
503
  total_amt = float(sum(abs(v) for v in amounts.values()))
504
  if total_amt <= 1e-12:
505
- return None, "All amounts are zero.", f"Universe set to {', '.join(universe)}", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=SUG_COLS_HOLD), None
 
 
 
506
  weights = {k: v / total_amt for k, v in amounts.items()}
507
 
508
- # ---- moments & portfolio metrics
509
  moms = estimate_all_moments_aligned(universe, lookback_years, RF_ANN)
510
- betas, covA, erp_ann, sigma_mkt = moms["betas"], moms["cov_ann"], moms["erp_ann"], moms["sigma_m_ann"]
511
- beta_p, er_p, sigma_p = portfolio_stats(weights, covA, betas, RF_ANN, erp_ann)
 
 
 
512
 
513
  a_sigma, b_sigma, mu_eff_sigma = efficient_same_sigma(sigma_p, RF_ANN, erp_ann, sigma_mkt)
514
- a_mu, b_mu, sigma_eff_mu = efficient_same_return(er_p, RF_ANN, erp_ann, sigma_mkt)
515
 
516
- # ---- dataset build (only for current universe)
517
  csv_path = dataset_path_for_universe(universe)
518
  if not os.path.exists(csv_path):
519
  synth = build_synthetic_dataset(universe, lookback_years, RF_ANN, erp_ann, n_rows=DATASET_ROWS)
520
- ensure_dir(csv_path)
521
- synth.to_csv(csv_path, index=False)
522
-
523
- # ---- dataset-based suggestions (simple table: percent & dollars)
524
- hold_df, first_pick_pt = suggest_from_dataset(csv_path, universe, total_amt, risk_level, use_embeddings)
525
- sug_mu, sug_sigma = (first_pick_pt if first_pick_pt is not None else (None, None))
526
-
527
- # ---- plot
528
- img = plot_cml_percent(
529
- RF_ANN, erp_ann, sigma_mkt,
530
- sigma_p, er_p,
531
- sigma_p, mu_eff_sigma,
532
- sigma_eff_mu, er_p,
533
- suggestion_sigma=sug_sigma, suggestion_mu=sug_mu
534
  )
535
 
536
- # ---- summary (percent everywhere)
 
 
 
 
 
 
 
 
 
 
537
  info_lines = []
538
- info_lines.append("### Inputs")
539
- info_lines.append(f"- Lookback years {int(lookback_years)}")
540
- info_lines.append(f"- Horizon years {int(round(HORIZON_YEARS))}")
541
- info_lines.append(f"- Risk-free {fmt_pct(RF_ANN)} from {RF_CODE}")
542
- info_lines.append(f"- Market ERP {fmt_pct(erp_ann)}")
543
- info_lines.append(f"- Market σ {fmt_pct(sigma_mkt)}")
544
- info_lines.append("")
545
- info_lines.append("### Your portfolio (CAPM expectations)")
546
- info_lines.append(f"- Beta {beta_p:.2f}")
547
- info_lines.append(f"- σ {fmt_pct(sigma_p)}")
548
- info_lines.append(f"- Expected return {fmt_pct(er_p)}")
549
- info_lines.append("")
550
- info_lines.append("### Efficient alternatives on CML")
551
- info_lines.append(f"- Same σ as your portfolio → Market weight {a_sigma:.2f}, Bills weight {b_sigma:.2f}, return {fmt_pct(mu_eff_sigma)}")
552
- info_lines.append(f"- Same expected return → Market weight {a_mu:.2f}, Bills weight {b_mu:.2f}, σ {fmt_pct(sigma_eff_mu)}")
553
- info_lines.append("")
554
- info_lines.append(f"### Dataset-based suggestions (risk: {risk_level})")
555
- info_lines.append("- Shown below as simple holdings: percent of exposure and dollars allocated.")
 
 
 
556
  if use_embeddings:
557
  info_lines.append("- Reranked with finance embeddings (FinLang/finance-embeddings-investopedia).")
558
-
559
  info = "\n".join(info_lines)
560
 
561
- # ---- positions table for current inputs
562
  rows = []
563
  for t in symbols:
564
- beta_val = 1.0 if t == MARKET_TICKER else betas.get(t, np.nan)
565
  rows.append({
566
  "ticker": t,
567
  "amount_usd": round(amounts.get(t, 0.0), 2),
568
  "weight_exposure": round(weights.get(t, 0.0), 6),
569
- "beta": round(beta_val, 6),
570
  })
571
  pos_table = pd.DataFrame(rows, columns=POS_COLS)
572
 
573
  uni_msg = f"Universe set to: {', '.join(universe)}"
574
- return img, info, uni_msg, pos_table, hold_df, csv_path
575
-
576
- # ---------- UI ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
  with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
578
  with gr.Accordion("About (assignment section 1)", open=False):
579
  gr.Markdown(
580
  "**Modality**: Text.\n\n"
581
- "**Use case**: Given a user’s stock/ETF universe and current dollar amounts, the system recommends three "
582
- "alternative mixes (Low / Medium / High risk) generated from a 1,000-row dataset of random portfolios built "
583
- "only from the user’s current universe.\n\n"
584
- "**System goal**: User provides text inputs (tickers and amounts). The system returns three similar items "
585
- "(suggested mixes) from the dataset. Optional reranking uses the text-embedding model "
586
- "`FinLang/finance-embeddings-investopedia`."
587
  )
588
 
589
  gr.Markdown(
590
  "## Efficient Portfolio Advisor\n"
591
- "Search symbols, enter dollar amounts, set your horizon. Prices from Yahoo Finance. "
592
- "Risk-free from FRED. Low/Medium/High suggestions are chosen only from a 1,000-row dataset generated from your current universe, "
593
- "optionally refined with finance embeddings."
594
  )
595
 
596
  with gr.Row():
597
  with gr.Column(scale=1):
598
- # search
599
  q = gr.Textbox(label="Search symbol")
600
  search_note = gr.Markdown(" ")
601
  matches = gr.Dropdown(choices=[], label="Matches")
@@ -603,25 +548,20 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
603
  search_btn = gr.Button("Search")
604
  add_btn = gr.Button("Add selected to portfolio")
605
 
606
- # portfolio table
607
  gr.Markdown("### Portfolio positions — type dollar amounts (negatives allowed for shorts)")
608
  table = gr.Dataframe(
609
  headers=["ticker", "amount_usd"],
610
  datatype=["str", "number"],
611
- row_count=0,
612
- col_count=(2, "fixed"),
613
  value=pd.DataFrame(columns=["ticker", "amount_usd"])
614
  )
615
 
616
- # horizon & lookback
617
  horizon = gr.Number(label="Horizon in years (1–100)", value=HORIZON_YEARS, precision=0)
618
  lookback = gr.Slider(1, 10, value=DEFAULT_LOOKBACK_YEARS, step=1, label="Lookback years for beta & sigma")
619
 
620
- # suggestions controls
621
  gr.Markdown("### Suggestions")
622
  risk = gr.Radio(["Low", "Medium", "High"], value="Medium", label="Risk tolerance")
623
  use_st = gr.Checkbox(label="Use finance embeddings to refine picks", value=False)
624
-
625
  run_btn = gr.Button("Compute (build dataset & suggest)")
626
 
627
  with gr.Column(scale=1):
@@ -636,8 +576,12 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
636
  value=pd.DataFrame(columns=POS_COLS),
637
  interactive=False
638
  )
639
- suggestions = gr.Dataframe(
640
- label="Dataset-based suggestions (top 3 — holdings shown as % and $)",
 
 
 
 
641
  headers=SUG_COLS_HOLD,
642
  datatype=["number", "str", "number", "number"],
643
  col_count=(len(SUG_COLS_HOLD), "fixed"),
@@ -646,11 +590,12 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
646
  )
647
  dl = gr.File(label="Generated dataset CSV", value=None, visible=True)
648
 
649
- # wiring
650
- def do_search(query):
651
- note, options = search_tickers_cb(query)
652
- return note, gr.update(choices=options)
653
 
 
 
654
  search_btn.click(fn=do_search, inputs=q, outputs=[search_note, matches])
655
  add_btn.click(fn=add_symbol, inputs=[matches, table], outputs=[table, search_note])
656
  table.change(fn=lock_ticker_column, inputs=table, outputs=table)
@@ -659,7 +604,13 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
659
  run_btn.click(
660
  fn=compute,
661
  inputs=[lookback, table, risk, use_st],
662
- outputs=[plot, summary, universe_msg, positions, suggestions, dl]
 
 
 
 
 
 
663
  )
664
 
665
  if __name__ == "__main__":
 
1
  # app.py
2
+ # Efficient Portfolio Advisor — CML-consistent plotting + suggestion picker
3
+ # Modality: Text. Optional reranking model: FinLang/finance-embeddings-investopedia
 
4
 
5
  import os
6
  import io
 
15
  import pandas as pd
16
  import matplotlib.pyplot as plt
17
  from PIL import Image
 
18
  import gradio as gr
19
+ import requests
20
  import yfinance as yf
21
 
22
+ _ST_MODEL = None # lazy load for embeddings
 
 
 
 
 
23
 
# ---------------- Config ----------------
# Directory where generated dataset CSVs are cached; created at import time.
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)

MARKET_TICKER = "VOO"          # ticker used as the market proxy for beta / ERP
MAX_TICKERS = 30               # cap on the number of user tickers
DEFAULT_LOOKBACK_YEARS = 10    # default price-history window
DATASET_ROWS = 1000            # default row count of the synthetic dataset

# (upper horizon bound in years, FRED series id), scanned in order.
FRED_MAP = [
    (1, "DGS1"), (2, "DGS2"), (3, "DGS3"), (5, "DGS5"),
    (7, "DGS7"), (10, "DGS10"), (20, "DGS20"), (30, "DGS30"), (100, "DGS30"),
]

# Column layouts of the positions table and the suggestion holdings table.
POS_COLS = ["ticker", "amount_usd", "weight_exposure", "beta"]
SUG_COLS_HOLD = ["pick", "ticker", "weight_%", "amount_$"]
38
 
39
+ # ---------------- Small helpers ----------------
def fmt_pct(x: float, dec: int = 2) -> str:
    """Format a fraction as a percentage string, e.g. 0.1234 -> "12.34%".

    Args:
        x: Value expressed as a fraction (0.05 means 5%).
        dec: Number of decimal places to show.

    Returns:
        The formatted percentage, or "" when the value cannot be formatted
        (for example ``None`` or a non-numeric string).
    """
    try:
        return f"{x*100:.{dec}f}%"
    except (TypeError, ValueError):
        # Only formatting failures are swallowed; KeyboardInterrupt and
        # SystemExit must keep propagating.
        return ""
 
 
 
 
 
 
 
 
43
 
def ensure_dir(p: str):
    """Create the parent directory of file path ``p`` if it does not exist.

    A bare file name has no parent component; in that case nothing is
    created, because ``os.makedirs("")`` would raise FileNotFoundError.
    """
    parent = os.path.dirname(p)
    if parent:
        os.makedirs(parent, exist_ok=True)
 
def fred_series_for_horizon(years: float) -> str:
    """Pick the FRED series code that matches an investment horizon.

    The horizon is clamped to [1, 100] years. The first FRED_MAP entry whose
    cutoff is at least the clamped horizon supplies the code; "DGS30" is the
    fallback when no entry matches.
    """
    horizon = max(1.0, min(100.0, float(years)))
    matching = (series for limit, series in FRED_MAP if horizon <= limit)
    return next(matching, "DGS30")
52
 
def fetch_fred_yield_annual(code: str) -> float:
    """Return the latest value of a FRED series as an annual decimal rate.

    Downloads the series CSV from FRED, takes the last numeric value in the
    second column and divides it by 100.

    Args:
        code: FRED series id, e.g. "DGS10".

    Returns:
        The latest rate as a fraction, or 0.03 when the download or parsing
        fails or the series has no numeric values.
    """
    fallback = 0.03
    url = f"https://fred.stlouisfed.org/graph/fredgraph.csv?id={code}"
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        df = pd.read_csv(io.StringIO(r.text))
        s = pd.to_numeric(df.iloc[:, 1], errors="coerce").dropna()
        return float(s.iloc[-1] / 100.0) if len(s) else fallback
    except Exception:
        # Best-effort lookup: any network or parsing problem falls back to
        # the default rate, but KeyboardInterrupt/SystemExit still propagate.
        return fallback
 
61
 
62
+ # ---------------- Prices & returns ----------------
def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
    """Download monthly prices for ``tickers`` over the last ``years`` years.

    "Adj Close" is preferred and "Close" is the fallback. Duplicate tickers
    are requested once, rows with no data at all are dropped, and remaining
    gaps are forward-filled.

    Args:
        tickers: Symbols to download.
        years: Length of the history window; 7 extra days are added.

    Returns:
        A DataFrame of prices with duplicate column labels removed, or an
        empty DataFrame when nothing was downloaded.
    """
    start = pd.Timestamp.today(tz="UTC") - pd.DateOffset(years=years, days=7)
    end = pd.Timestamp.today(tz="UTC")

    raw = yf.download(
        list(dict.fromkeys(tickers)),
        start=start.date(), end=end.date(),
        interval="1mo", auto_adjust=False, progress=False,
        group_by="ticker", threads=False,
    )
    if raw is None or len(raw) == 0:
        return pd.DataFrame()

    if isinstance(raw.columns, pd.MultiIndex):
        # With MultiIndex columns the price field sits on the last level.
        price = None
        for field in ("Adj Close", "Close"):
            if field in raw.columns.get_level_values(-1):
                price = raw.xs(field, axis=1, level=-1, drop_level=True)
                break
        if price is None:
            price = raw.copy()
            price.columns = [c[0] if isinstance(c, tuple) else c for c in price.columns]
    else:
        if "Adj Close" in raw.columns:
            price = raw["Adj Close"]
        elif "Close" in raw.columns:
            price = raw["Close"]
        else:
            price = raw

    if isinstance(price, pd.Series):
        price = price.to_frame()
    # DataFrame.ffill() replaces the deprecated fillna(method="ffill").
    price = price.dropna(how="all").ffill()
    price = price.loc[:, ~pd.Index(price.columns).duplicated()]
    return price
 
def monthly_returns(prices: pd.DataFrame) -> pd.DataFrame:
    """Convert a price table into simple period-over-period returns.

    Rows containing any missing return (including the first row) are dropped.
    ``fill_method=None`` is passed explicitly because the implicit
    forward-fill inside ``pct_change`` is deprecated in pandas.
    """
    return prices.pct_change(fill_method=None).dropna()
95
 
def annualize_mean(m):
    """Scale a monthly mean (scalar or array-like) to an annual one (x12)."""
    monthly = np.asarray(m, dtype=float)
    return monthly * 12.0
def annualize_sigma(s):
    """Scale a monthly standard deviation to an annual one (x sqrt(12))."""
    factor = math.sqrt(12.0)
    return np.asarray(s, dtype=float) * factor
 
 
 
98
 
99
+ # ---------------- Search & validation ----------------
100
  def yahoo_search(query: str):
101
+ if not query or not query.strip(): return []
 
102
  url = "https://query1.finance.yahoo.com/v1/finance/search"
103
  params = {"q": query.strip(), "quotesCount": 10, "newsCount": 0}
104
  headers = {"User-Agent": "Mozilla/5.0"}
105
  try:
106
  r = requests.get(url, params=params, headers=headers, timeout=10)
107
+ r.raise_for_status(); data = r.json()
 
108
  out = []
109
  for q in data.get("quotes", []):
110
  sym = q.get("symbol")
 
115
  if not out:
116
  out = [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": "—"}]
117
  return out[:10]
118
+ except:
119
  return [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": "—"}]
120
 
def validate_tickers(symbols: List[str], years: int) -> List[str]:
    """Return the subset of ``symbols`` that has downloadable price data.

    Symbols are stripped, upper-cased, de-duplicated in order and capped at
    MAX_TICKERS before the download.

    Args:
        symbols: Candidate ticker strings.
        years: Lookback window passed to the price download.

    Returns:
        Normalized symbols with at least one non-missing price, in input order.
    """
    base = list(dict.fromkeys(s.strip().upper() for s in symbols if s.strip()))[:MAX_TICKERS]
    if not base:
        # Nothing to validate, so skip the network call.
        return []
    px = fetch_prices_monthly(base + [MARKET_TICKER], years)
    # A column label alone is not proof of data: a symbol whose column
    # contains only missing values is rejected as well.
    return [s for s in base if s in px.columns and px[s].notna().any()]
126
 
127
+ # ---------------- Aligned CAPM moments (now includes MARKET in cov & μ) ----------------
def get_aligned_monthly_returns(symbols: List[str], years: int) -> pd.DataFrame:
    """Return monthly returns for ``symbols`` on a common set of dates.

    Symbols are de-duplicated in order, symbols without a return column are
    left out, and any date on which one of the remaining symbols has no
    return is dropped.
    """
    ordered = list(dict.fromkeys(symbols))
    returns = monthly_returns(fetch_prices_monthly(ordered, years))
    available = [sym for sym in ordered if sym in returns.columns]
    aligned = returns[available].dropna(how="any")
    keep = ~aligned.columns.duplicated()
    return aligned.loc[:, keep]
135
 
def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
    """Estimate annualized return moments for ``symbols`` plus the market.

    Args:
        symbols: Universe tickers; MARKET_TICKER is appended automatically.
        years: Lookback window for the monthly returns.
        rf_ann: Annual risk-free rate, used for the equity risk premium.

    Returns:
        dict with keys
        ``betas`` (ticker -> beta versus the market, market fixed at 1.0),
        ``cov_ann`` (annualized covariance DataFrame including the market),
        ``erp_ann`` (annualized market mean minus ``rf_ann``),
        ``sigma_m_ann`` (annualized market standard deviation) and
        ``mu_all_ann`` (annualized mean return per column).

    Raises:
        ValueError: if the market column is missing or fewer than three
            aligned observations are available.
    """
    R = get_aligned_monthly_returns(symbols + [MARKET_TICKER], years)
    if MARKET_TICKER not in R.columns or R.shape[0] < 3:
        raise ValueError("Not enough aligned data to estimate moments.")

    # Market statistics.
    market = R[MARKET_TICKER]
    mu_m_ann = float(annualize_mean(market.mean()))
    sigma_m_ann = float(annualize_sigma(market.std(ddof=1)))
    erp_ann = float(mu_m_ann - rf_ann)
    mu_all_ann = annualize_mean(R.mean(axis=0))

    # One covariance matrix serves both the betas and the annualized matrix.
    # Subtracting the constant monthly risk-free rate would not change any
    # covariance, so raw returns are used. atleast_2d covers the
    # single-column case, where np.cov returns a 0-d array.
    cov_m = np.atleast_2d(np.cov(R.values.T, ddof=1))
    cols = list(R.columns)
    m_idx = cols.index(MARKET_TICKER)
    var_m = max(float(cov_m[m_idx, m_idx]), 1e-6)  # floor avoids dividing by ~0
    betas: Dict[str, float] = {
        s: float(cov_m[i, m_idx] / var_m) for i, s in enumerate(cols)
    }
    betas[MARKET_TICKER] = 1.0

    covA = pd.DataFrame(cov_m * 12.0, index=R.columns, columns=R.columns)

    return {
        "betas": betas,
        "cov_ann": covA,
        "erp_ann": erp_ann,
        "sigma_m_ann": sigma_m_ann,
        "mu_all_ann": pd.Series(mu_all_ann, index=R.columns),
    }
168
 
def capm_er(beta: float, rf_ann: float, erp_ann: float) -> float:
    """Return the CAPM expected return: risk-free rate plus beta times ERP."""
    premium = beta * erp_ann
    return float(rf_ann + premium)
 
175
  rf_ann: float,
176
  erp_ann: float) -> Tuple[float, float, float]:
177
  tickers = list(weights.keys())
178
+ if len(tickers) == 0: return 0.0, 0.0, 0.0
 
179
  w = np.array([weights[t] for t in tickers], dtype=float)
180
+ gross = float(np.sum(np.abs(w))); w_expo = w / max(gross, 1e-12)
181
+
 
 
182
  beta_p = float(np.dot([betas.get(t, 0.0) for t in tickers], w_expo))
183
+ er_p = capm_er(beta_p, rf_ann, erp_ann)
184
+
185
  cov = cov_ann.reindex(index=tickers, columns=tickers).fillna(0.0).to_numpy()
186
  sigma_p = math.sqrt(float(max(w_expo.T @ cov @ w_expo, 0.0)))
187
  return beta_p, er_p, sigma_p
188
 
def portfolio_hist_return(weights: Dict[str, float], mu_all_ann: pd.Series) -> float:
    """Return the exposure-weighted historical mean return of a portfolio.

    Weights are normalized by gross exposure (sum of absolute weights).
    Tickers missing from ``mu_all_ann`` contribute a mean of 0.0.
    """
    names = list(weights)
    raw = np.array([weights[name] for name in names], dtype=float)
    gross = float(np.abs(raw).sum())
    exposures = raw / max(gross, 1e-12)
    means = mu_all_ann.reindex(names).fillna(0.0).to_numpy()
    return float(np.dot(means, exposures))
195
+
196
+ # ---------------- CML plot (percent axes) ----------------
def efficient_same_sigma(sigma_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
    """Return the market/bills mix on the CML that has volatility ``sigma_target``.

    Returns:
        (market weight, bills weight, expected return). When the market
        volatility is effectively zero, the all-bills mix (0.0, 1.0, rf_ann).
    """
    degenerate = sigma_mkt <= 1e-12
    if degenerate:
        return 0.0, 1.0, rf_ann
    market_w = sigma_target / sigma_mkt
    bills_w = 1.0 - market_w
    return market_w, bills_w, rf_ann + market_w * erp_ann
201
 
def efficient_same_return(mu_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
    """Return the market/bills mix on the CML that earns ``mu_target``.

    Returns:
        (market weight, bills weight, volatility). When the equity risk
        premium is effectively zero, the all-bills mix (0.0, 1.0, rf_ann).
    """
    no_premium = abs(erp_ann) <= 1e-12
    if no_premium:
        return 0.0, 1.0, rf_ann
    market_w = (mu_target - rf_ann) / erp_ann
    bills_w = 1.0 - market_w
    return market_w, bills_w, abs(market_w) * sigma_mkt
206
 
def plot_cml_percent(base, suggestion=None) -> Image.Image:
    """Render the capital market line chart (axes in percent) as a PIL image.

    Args:
        base: dict with keys ``rf``, ``erp``, ``sigma_m``, ``pt_sigma``,
            ``pt_mu``, ``same_sigma_sigma``, ``same_sigma_mu``,
            ``same_mu_sigma`` and ``same_mu_mu`` (all as fractions).
        suggestion: optional dict with ``sigma`` and ``mu`` for an extra
            "Suggestion" marker.

    Returns:
        The chart as a PNG-backed ``PIL.Image.Image``.

    Raises:
        KeyError: if ``base`` lacks one of the required keys.
    """
    rf_ann, erp, sig_m = base["rf"], base["erp"], base["sigma_m"]
    pt_s, pt_mu = base["pt_sigma"], base["pt_mu"]
    same_s_s, same_s_mu = base["same_sigma_sigma"], base["same_sigma_mu"]
    same_mu_s, same_mu_mu = base["same_mu_sigma"], base["same_mu_mu"]
    sug_sigma = suggestion["sigma"] if suggestion else 0.0

    # Make the x-range wide enough to show every plotted point with margin.
    xmax = max(0.3, sig_m*2.0, pt_s*1.4, same_mu_s*1.4, same_s_s*1.4, sug_sigma*1.4)
    xs = np.linspace(0, xmax, 160)
    slope = erp / max(sig_m, 1e-12)
    cml = rf_ann + slope * xs

    fig, ax = plt.subplots(figsize=(6, 4), dpi=120)
    try:
        ax.plot(xs*100, cml*100, label="CML via Market")

        ax.scatter([0.0], [rf_ann*100], label="Risk-free (FRED)")
        ax.scatter([sig_m*100], [(rf_ann+erp)*100], label="Market VOO")
        ax.scatter([pt_s*100], [pt_mu*100], label="Your portfolio")
        ax.scatter([same_s_s*100], [same_s_mu*100], label="Efficient same σ")
        ax.scatter([same_mu_s*100], [same_mu_mu*100], label="Efficient same return")

        if suggestion:
            ax.scatter([suggestion["sigma"]*100], [suggestion["mu"]*100], label="Suggestion")

        ax.set_xlabel("σ (annualized, %)")
        ax.set_ylabel("Expected return (annual, %)")
        ax.legend(loc="best", fontsize=8)
        fig.tight_layout()

        buf = io.BytesIO()
        fig.savefig(buf, format="png")
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
234
 
235
+ # ---------------- Synthetic dataset (universe only) ----------------
236
  def _row_exposures(row: pd.Series, universe: List[str]) -> Optional[np.ndarray]:
237
  try:
238
  ts = [t.strip() for t in str(row["tickers"]).split(",")]
239
  ws = [float(x) for x in str(row["weights"]).split(",")]
240
  wmap = {t: ws[i] for i, t in enumerate(ts) if i < len(ws)}
241
  w = np.array([wmap.get(t, 0.0) for t in universe], dtype=float)
242
+ gross = float(np.sum(np.abs(w)));
243
+ if gross <= 1e-12: return None
 
244
  return w / gross
245
+ except: return None
 
246
 
247
  def build_synthetic_dataset(universe: List[str], years: int, rf_ann: float, erp_ann: float, n_rows: int = DATASET_ROWS) -> pd.DataFrame:
 
248
  moms = estimate_all_moments_aligned(universe, years, rf_ann)
249
  covA, betas = moms["cov_ann"], moms["betas"]
250
 
251
+ rng = np.random.default_rng(12345); rows = []
 
252
  for i in range(n_rows):
253
  k = int(rng.integers(low=min(2, len(universe)), high=min(8, len(universe)) + 1))
254
  picks = list(rng.choice(universe, size=k, replace=False))
 
256
  raw = rng.dirichlet(np.ones(k))
257
  gross = 1.0 + float(rng.gamma(2.0, 0.5))
258
  w = gross * signs * raw
 
259
  beta_p, er_p, sigma_p = portfolio_stats({picks[j]: w[j] for j in range(k)}, covA, betas, rf_ann, erp_ann)
260
  rows.append({
261
  "id": i,
262
  "tickers": ",".join(picks),
263
  "weights": ",".join(f"{x:.6f}" for x in w),
264
+ "er_p": er_p, "sigma_p": sigma_p, "beta_p": beta_p
 
 
265
  })
266
  return pd.DataFrame(rows)
267
 
def dataset_path_for_universe(universe: List[str]) -> str:
    """Return the cache CSV path for a ticker universe.

    The file name is derived from the sorted tickers, so the same universe
    maps to the same file regardless of input order. A SHA-256 digest is used
    instead of the built-in ``hash()``, whose value for strings changes
    between interpreter runs and would make cached files unreachable after a
    restart.
    """
    import hashlib  # local import keeps this block self-contained

    key = ",".join(sorted(universe))
    digest = hashlib.sha256(key.encode("utf-8")).hexdigest()
    h = int(digest, 16) % (10**8)
    return os.path.join(DATA_DIR, f"investor_profiles_{h}.csv")
 
272
 
273
+ # ---------------- Suggestions (build + picker) ----------------
274
  def _risk_targets(sigmas: np.ndarray) -> Dict[str, float]:
275
+ return {"Low": float(np.quantile(sigmas, 0.15)),
276
+ "Medium": float(np.quantile(sigmas, 0.50)),
277
+ "High": float(np.quantile(sigmas, 0.85))}
 
 
 
278
 
def _describe_row_for_embeddings(row: pd.Series, universe: List[str]) -> str:
    """Build the short text description of a dataset row used for embedding.

    The text lists sigma, beta and expected return. When the exposures can be
    parsed, it also lists the four largest absolute exposures.
    """
    summary = [
        f"sigma {row['sigma_p']:.4f}",
        f"beta {row['beta_p']:.2f}",
        f"expected return {row['er_p']:.4f}",
    ]
    exposures = _row_exposures(row, universe)
    if exposures is None:
        return " ".join(summary)

    ranked = sorted(
        [(name, float(abs(weight))) for name, weight in zip(universe, exposures)],
        key=lambda pair: -pair[1],
    )
    focus = ", ".join([f"{t}:{w:.2f}" for t, w in ranked[:4]])
    summary.append("focus " + focus)
    return " ".join(summary)
286
 
 
 
 
 
 
 
 
287
  def _maybe_load_st_model():
288
  global _ST_MODEL
289
  if _ST_MODEL is None:
 
291
  _ST_MODEL = SentenceTransformer("FinLang/finance-embeddings-investopedia")
292
  return _ST_MODEL
293
 
def build_suggestions(csv_path: str,
                      universe: List[str],
                      total_amount: float,
                      risk_level: str,
                      use_embeddings: bool,
                      covA: pd.DataFrame,
                      betas: Dict[str, float],
                      rf_ann: float,
                      erp_ann: float,
                      mu_all_ann: pd.Series):
    """Pick up to three suggested mixes from the dataset at ``csv_path``.

    Rows are ranked by distance of ``sigma_p`` from the target volatility for
    ``risk_level``, optionally reranked by embedding similarity, then scored
    as ``dist - 0.2 * er_p``; the three lowest scores are kept.

    Args:
        csv_path: Path of the synthetic dataset CSV.
        universe: Tickers defining the exposure vector order.
        total_amount: Dollar amount used to turn exposures into dollars.
        risk_level: "Low", "Medium" or "High"; other values fall back to the
            median volatility and the "Medium" prompt.
        use_embeddings: Whether to rerank candidates with the embedding model.
        covA, betas, rf_ann, erp_ann, mu_all_ann: Current moments used to
            recompute each pick's statistics.

    Returns:
        (suggestions, first_table): a list of dicts with keys ``pick``,
        ``hold_df``, ``mu_hist``, ``sigma_hist``, ``beta`` and ``er_capm``,
        plus the holdings table of the first suggestion (an empty table when
        there is none).
    """
    try:
        df = pd.read_csv(csv_path)
    except (OSError, ValueError):
        # Missing, unreadable or unparsable dataset: no suggestions.
        return [], pd.DataFrame(columns=SUG_COLS_HOLD)

    if df.empty:
        return [], pd.DataFrame(columns=SUG_COLS_HOLD)

    sigmas = df["sigma_p"].to_numpy(dtype=float)
    target_sigma = _risk_targets(sigmas).get(risk_level, float(np.median(sigmas)))

    df = df.copy()
    df["dist"] = (df["sigma_p"] - target_sigma).abs()
    cand = df.nsmallest(100, "dist").reset_index(drop=True)

    if use_embeddings:
        model = _maybe_load_st_model()
        prompts = {
            "Low": "low risk conservative mix",
            "Medium": "balanced moderate risk",
            "High": "aggressive growth high risk",
        }
        # Same fallback policy as the sigma target: unknown labels behave
        # like "Medium" instead of raising KeyError.
        prompt = prompts.get(risk_level, prompts["Medium"])
        texts = [prompt] + [_describe_row_for_embeddings(r, universe) for _, r in cand.iterrows()]
        embs = model.encode(texts)
        S = model.similarity(embs[0:1], embs[1:]).flatten()
        cand = cand.assign(sim=S).sort_values("sim", ascending=False).head(50).reset_index(drop=True)

    cand["score"] = cand["dist"] - 0.2*cand["er_p"]
    picks = cand.nsmallest(3, "score").reset_index(drop=True)

    suggestions = []
    for _, row in picks.iterrows():
        expo = _row_exposures(row, universe)
        if expo is None:
            continue
        # Number picks by their position in the returned list, so the pick
        # column has no gaps when a row is skipped.
        pick_no = len(suggestions) + 1
        wmap = {universe[j]: float(expo[j]) for j in range(len(universe)) if abs(float(expo[j])) > 1e-4}
        # Recompute metrics with the current moments (historical mean for plotting).
        beta_s, er_capm_s, sigma_s = portfolio_stats(wmap, covA, betas, rf_ann, erp_ann)
        mu_hist_s = portfolio_hist_return(wmap, mu_all_ann)
        # Holdings table for this pick, largest absolute exposure first.
        rows_hold = [{
            "pick": pick_no,
            "ticker": t,
            "weight_%": round(w*100.0, 2),
            "amount_$": round(w*total_amount, 2),
        } for t, w in sorted(wmap.items(), key=lambda kv: -abs(kv[1]))]
        suggestions.append({
            "pick": pick_no,
            "hold_df": pd.DataFrame(rows_hold, columns=SUG_COLS_HOLD),
            "mu_hist": mu_hist_s, "sigma_hist": sigma_s,
            "beta": beta_s, "er_capm": er_capm_s,
        })

    first_table = suggestions[0]["hold_df"] if suggestions else pd.DataFrame(columns=SUG_COLS_HOLD)
    return suggestions, first_table
350
+
351
+ # ---------------- UI callbacks ----------------
def search_tickers_cb(q: str):
    """Search symbols for ``q`` and return (status note, dropdown options).

    Each option is formatted as "SYMBOL | name | exchange".
    """
    results = yahoo_search(q)
    if results:
        labels = [f"{h['symbol']} | {h['name']} | {h['exchange']}" for h in results]
        return "Select a symbol and click Add", labels
    return "No matches", []
357
 
358
  def add_symbol(selection: str, table: pd.DataFrame):
359
+ if not selection: return table, "Pick a row from Matches first."
 
360
  symbol = selection.split("|")[0].strip().upper()
361
  current = [] if table is None or len(table) == 0 else [str(x).upper() for x in table["ticker"].tolist() if str(x) != "nan"]
362
  tickers = current if symbol in current else current + [symbol]
363
  val = validate_tickers(tickers, years=DEFAULT_LOOKBACK_YEARS)
364
  tickers = [t for t in tickers if t in val]
 
365
  amt_map = {}
366
  if table is not None and len(table) > 0:
367
  for _, r in table.iterrows():
 
371
  new_table = pd.DataFrame({"ticker": tickers, "amount_usd": [amt_map.get(t, 0.0) for t in tickers]})
372
  msg = f"Added {symbol}" if symbol in tickers else f"{symbol} not valid"
373
  if len(new_table) > MAX_TICKERS:
374
+ new_table = new_table.iloc[:MAX_TICKERS]; msg = f"Reached max of {MAX_TICKERS}"
 
375
  return new_table, msg
376
 
377
  def lock_ticker_column(tb: pd.DataFrame):
 
384
  amounts = amounts[:len(tickers)] + [0.0] * max(0, len(tickers) - len(amounts))
385
  return pd.DataFrame({"ticker": tickers, "amount_usd": amounts})
386
 
 
# Module-level horizon state, updated by set_horizon().
# NOTE: RF_ANN is resolved through fetch_fred_yield_annual() at import time.
HORIZON_YEARS = 10
RF_CODE = fred_series_for_horizon(HORIZON_YEARS)
RF_ANN = fetch_fred_yield_annual(RF_CODE)
390
 
def set_horizon(years: float):
    """Update the global horizon and refresh the matching risk-free rate.

    The horizon is clamped to [1, 100] years. On success the module globals
    HORIZON_YEARS, RF_CODE and RF_ANN are replaced.

    Args:
        years: Requested horizon; ``None`` or a non-numeric value leaves the
            current settings untouched.

    Returns:
        A status message for the UI.
    """
    global HORIZON_YEARS, RF_CODE, RF_ANN
    try:
        y = max(1.0, min(100.0, float(years)))
    except (TypeError, ValueError):
        # An unusable value (for example None) is reported, not raised.
        return "Enter a horizon between 1 and 100 years."
    code = fred_series_for_horizon(y)
    rf = fetch_fred_yield_annual(code)
    HORIZON_YEARS, RF_CODE, RF_ANN = y, code, rf
    return f"Risk-free series {code}. Latest annual rate {fmt_pct(rf)}. Horizon set to {int(round(y))} years."
397
 
def _compute_early_exit(message: str, universe_msg: str = "—"):
    """Build the nine-value result ``compute`` returns when it cannot proceed."""
    return (
        None,                                   # plot image
        message,                                # summary text
        universe_msg,                           # universe message
        pd.DataFrame(columns=POS_COLS),         # positions table
        pd.DataFrame(columns=SUG_COLS_HOLD),    # holdings table
        None,                                   # dataset CSV path
        [],                                     # suggestions state
        {},                                     # plot state
        "",                                     # pick info
    )


def compute(lookback_years: int,
            table: pd.DataFrame,
            risk_level: str,
            use_embeddings: bool):
    """Analyse the user's portfolio and build dataset-based suggestions.

    Args:
        lookback_years: History window for betas, sigmas and means.
        table: Positions table with ``ticker`` and ``amount_usd`` columns.
        risk_level: "Low", "Medium" or "High".
        use_embeddings: Whether suggestions are reranked with embeddings.

    Returns:
        A 9-tuple matching the click handler's outputs: plot image, summary
        markdown, universe message, positions table, holdings table of the
        first suggestion, dataset CSV path, suggestions list, plot state dict
        and pick-info markdown. Every early exit returns the same nine slots
        with empty placeholders.
    """
    if table is None:
        return _compute_early_exit("Add at least one ticker.")

    df = table.dropna()
    df["ticker"] = df["ticker"].astype(str).str.upper().str.strip()
    df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce").fillna(0.0)

    symbols = [t for t in df["ticker"].tolist() if t]
    if len(symbols) == 0:
        return _compute_early_exit("Add at least one ticker.")

    symbols = validate_tickers(symbols, lookback_years)
    if len(symbols) == 0:
        return _compute_early_exit("Could not validate any tickers.")

    # NOTE(review): this slice can drop the alphabetically last name when
    # MAX_TICKERS user symbols plus the market ticker are present; confirm
    # whether that is intended.
    universe = list(sorted(set(symbols + [MARKET_TICKER])))[:MAX_TICKERS]

    df = df[df["ticker"].isin(symbols)].copy()
    amounts = {r["ticker"]: float(r["amount_usd"]) for _, r in df.iterrows()}
    total_amt = float(sum(abs(v) for v in amounts.values()))
    if total_amt <= 1e-12:
        return _compute_early_exit("All amounts are zero.", f"Universe set to {', '.join(universe)}")

    weights = {k: v / total_amt for k, v in amounts.items()}

    try:
        moms = estimate_all_moments_aligned(universe, lookback_years, RF_ANN)
    except ValueError as exc:
        # Too little aligned history: report it instead of crashing the handler.
        return _compute_early_exit(str(exc), f"Universe set to {', '.join(universe)}")
    betas, covA, erp_ann = moms["betas"], moms["cov_ann"], moms["erp_ann"]
    sigma_mkt, mu_all_ann = moms["sigma_m_ann"], moms["mu_all_ann"]

    beta_p, er_capm_p, sigma_p = portfolio_stats(weights, covA, betas, RF_ANN, erp_ann)
    mu_hist_p = portfolio_hist_return(weights, mu_all_ann)  # used for plotting

    a_sigma, b_sigma, mu_eff_sigma = efficient_same_sigma(sigma_p, RF_ANN, erp_ann, sigma_mkt)
    a_mu, b_mu, sigma_eff_mu = efficient_same_return(mu_hist_p, RF_ANN, erp_ann, sigma_mkt)

    # Dataset for this universe (built once, then reused from disk).
    csv_path = dataset_path_for_universe(universe)
    if not os.path.exists(csv_path):
        synth = build_synthetic_dataset(universe, lookback_years, RF_ANN, erp_ann, n_rows=DATASET_ROWS)
        ensure_dir(csv_path)
        synth.to_csv(csv_path, index=False)

    # Suggestions list + holdings table of the first pick.
    suggestions, first_table = build_suggestions(
        csv_path, universe, total_amt, risk_level, use_embeddings,
        covA, betas, RF_ANN, erp_ann, mu_all_ann
    )

    # Plot state + initial image with the first suggestion overlaid.
    plot_state = {
        "rf": RF_ANN, "erp": erp_ann, "sigma_m": sigma_mkt,
        "pt_sigma": sigma_p, "pt_mu": mu_hist_p,
        "same_sigma_sigma": sigma_p, "same_sigma_mu": mu_eff_sigma,
        "same_mu_sigma": sigma_eff_mu, "same_mu_mu": mu_hist_p
    }
    sug_overlay = {"sigma": suggestions[0]["sigma_hist"], "mu": suggestions[0]["mu_hist"]} if suggestions else None
    img = plot_cml_percent(plot_state, suggestion=sug_overlay)

    # Summary text (shows both CAPM and historical figures for the portfolio).
    info_lines = [
        "### Inputs",
        f"- Lookback years {int(lookback_years)}",
        f"- Horizon years {int(round(HORIZON_YEARS))}",
        f"- Risk-free {fmt_pct(RF_ANN)} from {RF_CODE}",
        f"- Market ERP {fmt_pct(erp_ann)}",
        f"- Market σ {fmt_pct(sigma_mkt)}",
        "",
        "### Your portfolio",
        f"- Beta {beta_p:.2f}",
        f"- σ (historical) {fmt_pct(sigma_p)}",
        f"- Expected return (historical) {fmt_pct(mu_hist_p)}",
        f"- Expected return (CAPM / SML) {fmt_pct(er_capm_p)}",
        "",
        "### Efficient alternatives on CML",
        f"- Same σ as your portfolio → Market {a_sigma:.2f}, Bills {b_sigma:.2f}, return {fmt_pct(mu_eff_sigma)}",
        f"- Same return (historical) → Market {a_mu:.2f}, Bills {b_mu:.2f}, σ {fmt_pct(sigma_eff_mu)}",
        "",
        f"### Dataset-based suggestions (risk: {risk_level})",
        "- Use the selector below to flip between Pick #1 / #2 / #3. Table shows % exposure and $ amounts."
    ]
    if use_embeddings:
        info_lines.append("- Reranked with finance embeddings (FinLang/finance-embeddings-investopedia).")
    info = "\n".join(info_lines)

    # Positions table.
    rows = []
    for t in symbols:
        rows.append({
            "ticker": t,
            "amount_usd": round(amounts.get(t, 0.0), 2),
            "weight_exposure": round(weights.get(t, 0.0), 6),
            "beta": round(betas.get(t, np.nan), 6),
        })
    pos_table = pd.DataFrame(rows, columns=POS_COLS)

    uni_msg = f"Universe set to: {', '.join(universe)}"
    # Short description of pick #1 for the picker.
    pick_info = ""
    if suggestions:
        s = suggestions[0]
        pick_info = (f"**Pick #1** — σ {fmt_pct(s['sigma_hist'])}, "
                     f"ER (hist) {fmt_pct(s['mu_hist'])}, "
                     f"ER (CAPM) {fmt_pct(s['er_capm'])}, beta {s['beta']:.2f}")

    return img, info, uni_msg, pos_table, first_table, csv_path, suggestions, plot_state, pick_info
510
+
def change_pick(idx: int, suggestions, plot_state):
    """Switch the displayed suggestion to pick number ``idx`` (1-based).

    Args:
        idx: Pick number from the slider; out-of-range values fall back to
            the first suggestion.
        suggestions: List of suggestion dicts produced by ``compute``.
        plot_state: Plot inputs produced by ``compute``; may be empty before
            the first computation.

    Returns:
        (holdings table, plot image or None, pick-info markdown).
    """
    if not suggestions or idx is None:
        # Without plot inputs there is nothing to draw; an empty plot state
        # would raise KeyError inside plot_cml_percent.
        base_img = plot_cml_percent(plot_state) if plot_state else None
        return pd.DataFrame(columns=SUG_COLS_HOLD), base_img, ""

    i = int(idx) - 1
    if i < 0 or i >= len(suggestions):
        i = 0
    s = suggestions[i]
    overlay = {"sigma": s["sigma_hist"], "mu": s["mu_hist"]}
    img = plot_cml_percent(plot_state, suggestion=overlay) if plot_state else None
    # Label with the pick actually shown, which differs from idx after a fallback.
    pick_info = (f"**Pick #{i + 1}** — σ {fmt_pct(s['sigma_hist'])}, "
                 f"ER (hist) {fmt_pct(s['mu_hist'])}, "
                 f"ER (CAPM) {fmt_pct(s['er_capm'])}, beta {s['beta']:.2f}")
    return s["hold_df"], img, pick_info
524
+
525
+ # ---------------- UI ----------------
526
  with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
527
  with gr.Accordion("About (assignment section 1)", open=False):
528
  gr.Markdown(
529
  "**Modality**: Text.\n\n"
530
+ "**Use case**: Given a user’s stock/ETF universe and dollar amounts, the system recommends three "
531
+ "alternative mixes (Low / Medium / High risk) drawn from a 1,000-row dataset generated from the user’s current universe.\n\n"
532
+ "**System goal**: User inputs text (tickers & amounts). System returns three similar items (suggested mixes) from the dataset. "
533
+ "Optional reranking uses the text-embedding model `FinLang/finance-embeddings-investopedia`."
 
 
534
  )
535
 
536
  gr.Markdown(
537
  "## Efficient Portfolio Advisor\n"
538
+ "Search symbols, enter dollar amounts, set your horizon. Prices from Yahoo Finance. Risk-free from FRED. "
539
+ "Suggestions are built only from your current universe and optionally refined with finance embeddings."
 
540
  )
541
 
542
  with gr.Row():
543
  with gr.Column(scale=1):
 
544
  q = gr.Textbox(label="Search symbol")
545
  search_note = gr.Markdown(" ")
546
  matches = gr.Dropdown(choices=[], label="Matches")
 
548
  search_btn = gr.Button("Search")
549
  add_btn = gr.Button("Add selected to portfolio")
550
 
 
551
  gr.Markdown("### Portfolio positions — type dollar amounts (negatives allowed for shorts)")
552
  table = gr.Dataframe(
553
  headers=["ticker", "amount_usd"],
554
  datatype=["str", "number"],
555
+ row_count=0, col_count=(2, "fixed"),
 
556
  value=pd.DataFrame(columns=["ticker", "amount_usd"])
557
  )
558
 
 
559
  horizon = gr.Number(label="Horizon in years (1–100)", value=HORIZON_YEARS, precision=0)
560
  lookback = gr.Slider(1, 10, value=DEFAULT_LOOKBACK_YEARS, step=1, label="Lookback years for beta & sigma")
561
 
 
562
  gr.Markdown("### Suggestions")
563
  risk = gr.Radio(["Low", "Medium", "High"], value="Medium", label="Risk tolerance")
564
  use_st = gr.Checkbox(label="Use finance embeddings to refine picks", value=False)
 
565
  run_btn = gr.Button("Compute (build dataset & suggest)")
566
 
567
  with gr.Column(scale=1):
 
576
  value=pd.DataFrame(columns=POS_COLS),
577
  interactive=False
578
  )
579
+
580
+ # Suggestion picker
581
+ pick_slider = gr.Slider(1, 3, value=1, step=1, label="View suggested mix #", interactive=True)
582
+ pick_info = gr.Markdown("")
583
+ suggestions_tbl = gr.Dataframe(
584
+ label="Holdings (for selected pick) — percent & dollars",
585
  headers=SUG_COLS_HOLD,
586
  datatype=["number", "str", "number", "number"],
587
  col_count=(len(SUG_COLS_HOLD), "fixed"),
 
590
  )
591
  dl = gr.File(label="Generated dataset CSV", value=None, visible=True)
592
 
593
+ # States to support picker
594
+ sug_state = gr.State([])
595
+ plot_state = gr.State({})
 
596
 
597
+ # Wire up events
598
+ def do_search(query): note, options = search_tickers_cb(query); return note, gr.update(choices=options)
599
  search_btn.click(fn=do_search, inputs=q, outputs=[search_note, matches])
600
  add_btn.click(fn=add_symbol, inputs=[matches, table], outputs=[table, search_note])
601
  table.change(fn=lock_ticker_column, inputs=table, outputs=table)
 
604
  run_btn.click(
605
  fn=compute,
606
  inputs=[lookback, table, risk, use_st],
607
+ outputs=[plot, summary, universe_msg, positions, suggestions_tbl, dl, sug_state, plot_state, pick_info]
608
+ )
609
+
610
+ pick_slider.change(
611
+ fn=change_pick,
612
+ inputs=[pick_slider, sug_state, plot_state],
613
+ outputs=[suggestions_tbl, plot, pick_info]
614
  )
615
 
616
  if __name__ == "__main__":