Tulitula committed on
Commit
797be6b
·
verified ·
1 Parent(s): 53f36cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +455 -525
app.py CHANGED
@@ -1,5 +1,5 @@
1
- # app.py
2
- import os, io, math, json, time, random, warnings
3
  warnings.filterwarnings("ignore")
4
 
5
  from typing import List, Tuple, Dict, Optional
@@ -12,25 +12,57 @@ import requests
12
  import yfinance as yf
13
  import gradio as gr
14
 
15
- # Optional: finance embeddings for mild re-ranking of candidates
16
- try:
17
- from sentence_transformers import SentenceTransformer
18
- _EMB_MODEL = "FinLang/finance-embeddings-investopedia"
19
- _emb = SentenceTransformer(_EMB_MODEL)
20
- except Exception:
21
- _emb = None
22
 
23
- # ---------------- config ----------------
 
 
24
  DATA_DIR = "data"
25
- os.makedirs(DATA_DIR, exist_ok=True)
 
 
 
26
 
27
- DEFAULT_LOOKBACK_YEARS = 10
28
- MAX_TICKERS = 25
29
- MARKET_TICKER = "VOO"
30
 
 
31
  POS_COLS = ["ticker", "amount_usd", "weight_exposure", "beta"]
32
- SUG_TABLE_COLS = ["ticker", "weight_%", "amount_$"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
 
 
 
 
 
 
 
 
 
34
  FRED_MAP = [
35
  (1, "DGS1"),
36
  (2, "DGS2"),
@@ -43,16 +75,6 @@ FRED_MAP = [
43
  (100, "DGS30"),
44
  ]
45
 
46
- # ---------------- helpers ----------------
47
- def ensure_data_dir():
48
- os.makedirs(DATA_DIR, exist_ok=True)
49
-
50
- def empty_positions_df():
51
- return pd.DataFrame(columns=POS_COLS)
52
-
53
- def empty_suggest_df():
54
- return pd.DataFrame(columns=SUG_TABLE_COLS)
55
-
56
  def fred_series_for_horizon(years: float) -> str:
57
  y = max(1.0, min(100.0, float(years)))
58
  for cutoff, code in FRED_MAP:
@@ -71,125 +93,94 @@ def fetch_fred_yield_annual(code: str) -> float:
71
  except Exception:
72
  return 0.03
73
 
74
- def _extract_close(df: pd.DataFrame, tickers: List[str]) -> pd.DataFrame:
75
- """
76
- Make yfinance output consistently a (date x tickers) DataFrame of Close prices.
77
- Handles single/multi ticker and (Adj Close|Close) cases.
78
- """
79
  if isinstance(df, pd.Series):
80
- # Rare, but normalize
81
- out = df.to_frame(name=tickers[0])
82
- return out
83
-
84
  if isinstance(df.columns, pd.MultiIndex):
85
- lv0 = df.columns.get_level_values(0)
86
- if "Close" in lv0:
87
- px = df["Close"].copy()
88
- elif "Adj Close" in lv0:
89
- px = df["Adj Close"].copy()
90
- else:
91
- # Fallback to the first price-like level
92
- first = next((x for x in ["Adj Close", "Close", "Close*"] if x in lv0), None)
93
- if first is None:
94
- first = lv0[0]
95
- px = df[first].copy()
96
- px.columns = [str(c) for c in px.columns]
97
- return px
98
-
99
- # Single ticker case with flat columns
100
- candidates = [c for c in ["Close", "Adj Close"] if c in df.columns]
101
- if candidates:
102
- col = candidates[0]
103
- return df[[col]].rename(columns={col: tickers[0]})
104
-
105
- # Fallback: take first numeric column
106
- first_num = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
107
- if first_num:
108
- out = df[[first_num[0]]].copy()
109
- out.columns = [tickers[0]]
110
- return out
111
-
112
- raise ValueError("Could not extract a price column")
113
 
114
  def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
115
- start = pd.Timestamp.today(tz="UTC") - pd.DateOffset(years=years, days=7)
116
- end = pd.Timestamp.today(tz="UTC")
 
 
 
117
  df = yf.download(
118
- list(dict.fromkeys(tickers)),
119
- start=start.date(),
120
- end=end.date(),
121
  interval="1mo",
122
  auto_adjust=True,
123
  progress=False,
124
- group_by="column",
125
  )
126
- px = _extract_close(df, tickers)
127
- px = px.dropna(how="all").ffill()
128
- return px
 
 
 
 
 
 
 
 
129
 
130
  def monthly_returns(prices: pd.DataFrame) -> pd.DataFrame:
131
- return prices.pct_change().dropna()
132
-
133
- def annualize_mean(m):
134
- return np.asarray(m, dtype=float) * 12.0
135
-
136
- def annualize_sigma(s):
137
- return np.asarray(s, dtype=float) * math.sqrt(12.0)
138
-
139
def yahoo_search(query: str):
    """Look up ticker symbols matching *query* via the Yahoo Finance search endpoint.

    Args:
        query: Free-text symbol or name typed by the user.

    Returns:
        A list of at most 10 dicts with keys ``symbol``, ``name`` and
        ``exchange``. An empty list for a blank query. When the request fails
        or yields no usable quotes, a single "typed symbol" entry built from
        the upper-cased query is returned so the caller can still try it.
    """
    text = query.strip() if isinstance(query, str) else ""
    if not text:
        return []

    # Single definition of the best-effort fallback used on every failure path.
    typed_fallback = [{"symbol": text.upper(), "name": "typed symbol", "exchange": "n a"}]

    url = "https://query1.finance.yahoo.com/v1/finance/search"
    params = {"q": text, "quotesCount": 10, "newsCount": 0}
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        r = requests.get(url, params=params, headers=headers, timeout=10)
        r.raise_for_status()
        data = r.json()
    except (requests.RequestException, ValueError):
        # Network/HTTP/JSON problems only; parsing below is handled per quote
        # so one malformed entry no longer discards the valid ones.
        return typed_fallback

    quotes = data.get("quotes") if isinstance(data, dict) else None
    out = []
    for q in quotes or []:
        if not isinstance(q, dict):
            continue
        sym = q.get("symbol")
        if isinstance(sym, str) and sym and sym.isascii():
            out.append({
                "symbol": sym,
                "name": q.get("shortname") or q.get("longname") or "",
                "exchange": q.get("exchDisp") or "",
            })
    return out[:10] if out else typed_fallback
161
-
162
- def validate_tickers(symbols: List[str], years: int) -> List[str]:
163
- base = [s for s in dict.fromkeys(symbols) if s]
164
- try:
165
- px = fetch_prices_monthly(base + [MARKET_TICKER], years)
166
- except Exception:
167
- return []
168
- ok = [s for s in base if s in px.columns]
169
- return ok
170
-
171
- # -------------- aligned moments --------------
172
- def get_aligned_monthly_returns(symbols: List[str], years: int) -> pd.DataFrame:
173
- uniq = [c for c in dict.fromkeys(symbols) if c != MARKET_TICKER]
174
- tickers = uniq + [MARKET_TICKER]
175
- px = fetch_prices_monthly(tickers, years)
176
  rets = monthly_returns(px)
177
- cols = [c for c in uniq if c in rets.columns] + ([MARKET_TICKER] if MARKET_TICKER in rets.columns else [])
 
 
 
 
 
 
 
 
178
  R = rets[cols].dropna(how="any")
179
- return R.loc[:, ~R.columns.duplicated()]
 
180
 
181
  def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
182
- R = get_aligned_monthly_returns(symbols + [MARKET_TICKER], years)
183
- if MARKET_TICKER not in R.columns or R.shape[0] < 3:
184
- raise ValueError("Not enough aligned returns (market missing or few rows).")
185
  rf_m = rf_ann / 12.0
186
 
187
- m = R[MARKET_TICKER]
188
  if isinstance(m, pd.DataFrame):
189
  m = m.iloc[:, 0].squeeze()
190
 
191
- mu_m_ann = float(annualize_mean(m.mean()))
192
- sigma_m_ann = float(annualize_sigma(m.std(ddof=1)))
193
  erp_ann = float(mu_m_ann - rf_ann)
194
 
195
  ex_m = m - rf_m
@@ -197,21 +188,27 @@ def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
197
  var_m = max(var_m, 1e-8)
198
 
199
  betas: Dict[str, float] = {}
200
- for s in [c for c in R.columns if c != MARKET_TICKER]:
201
  ex_s = R[s] - rf_m
202
- betas[s] = float(np.cov(ex_s.values, ex_m.values, ddof=1)[0, 1] / var_m)
203
-
204
- betas[MARKET_TICKER] = 1.0 # by definition
205
-
206
- asset_cols = [c for c in R.columns if c != MARKET_TICKER]
207
- if asset_cols:
208
- cov_m = np.cov(R[asset_cols].values.T, ddof=1)
209
- covA = pd.DataFrame(cov_m * 12.0, index=asset_cols, columns=asset_cols)
210
- else:
211
- covA = pd.DataFrame([], index=[], columns=[])
212
-
213
- return {"betas": betas, "cov_ann": covA, "erp_ann": erp_ann, "sigma_m_ann": sigma_m_ann}
214
-
 
 
 
 
 
 
215
  def capm_er(beta: float, rf_ann: float, erp_ann: float) -> float:
216
  return float(rf_ann + beta * erp_ann)
217
 
@@ -221,239 +218,184 @@ def portfolio_stats(weights: Dict[str, float],
221
  rf_ann: float,
222
  erp_ann: float) -> Tuple[float, float, float]:
223
  tickers = list(weights.keys())
 
 
224
  w = np.array([weights[t] for t in tickers], dtype=float)
225
  gross = float(np.sum(np.abs(w)))
226
  if gross == 0:
227
- return 0.0, 0.0, 0.0
228
  w_expo = w / gross
229
  beta_p = float(np.dot([betas.get(t, 0.0) for t in tickers], w_expo))
230
- er_p = capm_er(beta_p, rf_ann, erp_ann) # CAPM expected return
231
  cov = cov_ann.reindex(index=tickers, columns=tickers).fillna(0.0).to_numpy()
232
  sigma_p = math.sqrt(float(max(w_expo.T @ cov @ w_expo, 0.0)))
233
- return beta_p, er_p, sigma_p
234
-
235
- # -------------- CML helpers --------------
236
- def efficient_same_sigma(sigma_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
237
- if sigma_mkt <= 1e-12:
238
- return 0.0, 1.0, rf_ann
239
- a = sigma_target / sigma_mkt
240
- return a, 1.0 - a, rf_ann + a * erp_ann
241
-
242
def efficient_same_return(mu_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
    """Find the market/bills mix on the CML that earns *mu_target*.

    Args:
        mu_target: Target annual expected return.
        rf_ann: Annual risk-free rate.
        erp_ann: Annual market excess return over the risk-free rate.
        sigma_mkt: Annual market volatility.

    Returns:
        Tuple ``(a, b, sigma)``: market weight ``a``, bills weight
        ``b = 1 - a``, and the volatility ``abs(a) * sigma_mkt`` of that mix.
        When the excess return is (numerically) zero the target cannot be
        reached by levering the market, so the all-bills mix
        ``(0.0, 1.0, 0.0)`` is returned.
    """
    if abs(erp_ann) <= 1e-12:
        # All bills: zero market exposure means zero volatility. The third
        # slot is a sigma, not a return, so it must not be rf_ann.
        return 0.0, 1.0, 0.0
    a = (mu_target - rf_ann) / erp_ann
    return a, 1.0 - a, abs(a) * sigma_mkt
247
-
248
- def _pct(x: float) -> float:
249
- return float(x) * 100.0
250
-
251
- def plot_cml(
252
- rf_ann, erp_ann, sigma_mkt,
253
- pt_sigma, pt_mu, # <-- portfolio CAPM point
254
- same_sigma_sigma, same_sigma_mu,
255
- same_mu_sigma, same_mu_mu,
256
- sugg_sigma=None, sugg_mu=None
257
- ) -> Image.Image:
258
- fig = plt.figure(figsize=(6.4, 4.2), dpi=140)
259
-
260
- xmax = max(0.30, sigma_mkt * 2.0, pt_sigma * 1.4, same_mu_sigma * 1.4, same_sigma_sigma * 1.4, (sugg_sigma or 0.0) * 1.4)
261
- xs = np.linspace(0, xmax, 200)
262
  slope = erp_ann / max(sigma_mkt, 1e-12)
 
 
263
  cml = rf_ann + slope * xs
264
  plt.plot(_pct(xs), _pct(cml), label="CML via Market", linewidth=1.8)
265
 
266
- # Key points
267
- plt.scatter([0.0], [_pct(rf_ann)], label="Risk-free (FRED)")
268
- plt.scatter([_pct(sigma_mkt)], [_pct(rf_ann + erp_ann)], label=f"Market {MARKET_TICKER}")
269
- plt.scatter([_pct(pt_sigma)], [_pct(pt_mu)], label="Your portfolio (CAPM)")
270
 
271
- plt.scatter([_pct(same_sigma_sigma)], [_pct(same_sigma_mu)], label="Efficient same σ")
272
- plt.scatter([_pct(same_mu_sigma)], [_pct(same_mu_mu)], label="Efficient same return")
273
- if sugg_sigma is not None and sugg_mu is not None:
274
- plt.scatter([_pct(sugg_sigma)], [_pct(sugg_mu)], label="Suggestion")
275
 
276
- plt.xlabel("σ (annualized, %)")
 
 
 
 
 
 
277
  plt.ylabel("Expected return (annual, %)")
278
  plt.legend(loc="best", fontsize=8)
279
  plt.tight_layout()
280
-
281
  buf = io.BytesIO()
282
  plt.savefig(buf, format="png")
283
  plt.close(fig)
284
  buf.seek(0)
285
  return Image.open(buf)
286
 
287
- # -------------- synthetic dataset for suggestions --------------
288
- def synth_profile(rng: np.random.Generator) -> str:
289
- risk = rng.choice(["cautious", "balanced", "moderate", "growth", "aggressive"])
290
- horizon = rng.choice(["three years", "five years", "seven years", "ten years", "fifteen years"])
291
- goal = rng.choice(["retirement savings", "first home", "education fund", "wealth building", "travel fund", "emergency buffer"])
292
- return f"{risk} investor, {horizon} horizon, goal is {goal}."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
- def build_synthetic_dataset(universe: List[str], years: int, rf_ann: float, erp_ann: float, covA: pd.DataFrame, betas: Dict[str, float]) -> pd.DataFrame:
295
- # build 1,000 random portfolios over the user universe (CAPM ER, cov-based sigma)
296
- rng = np.random.default_rng(42 + int(time.time()) % 10000)
297
- rows = []
298
- for i in range(1000):
299
- k = rng.integers(low=min(2, len(universe)), high=min(8, len(universe)) + 1)
300
- picks = list(rng.choice(universe, size=k, replace=False))
301
- signs = rng.choice([-1.0, 1.0], size=k, p=[0.25, 0.75])
302
- raw = rng.dirichlet(np.ones(k))
303
- gross = 1.0 + float(rng.gamma(2.0, 0.5))
304
- w = gross * signs * raw # exposure weights that sum (in abs) to gross
305
 
306
- wmap = {picks[j]: float(w[j]) for j in range(k)}
307
- beta_p, er_p, sigma_p = portfolio_stats(wmap, covA, betas, rf_ann, erp_ann)
 
 
 
 
 
 
 
 
308
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  rows.append({
310
- "id": i,
311
- "profile_text": synth_profile(rng),
312
  "tickers": ",".join(picks),
313
- "weights": ",".join(f"{x:.6f}" for x in w),
314
- "beta_p": beta_p,
315
- "er_p": er_p,
316
- "sigma_p": sigma_p
 
317
  })
318
- return pd.DataFrame(rows)
319
-
320
- def _row_to_exposures(row: pd.Series, universe: List[str]) -> Optional[np.ndarray]:
321
- try:
322
- ts = [t.strip() for t in str(row["tickers"]).split(",")]
323
- ws = [float(x) for x in str(row["weights"]).split(",")]
324
- wmap = {t: ws[i] for i, t in enumerate(ts) if i < len(ws)}
325
- w = np.array([wmap.get(t, 0.0) for t in universe], dtype=float)
326
- gross = float(np.sum(np.abs(w)))
327
- if gross <= 1e-12:
328
- return None
329
- return w / gross
330
- except Exception:
331
- return None
332
-
333
- def _risk_query_text(risk: str) -> str:
334
- if risk == "Low":
335
- return "conservative low-volatility long-term capital preservation diversified investment grade"
336
- if risk == "High":
337
- return "aggressive high risk high growth momentum speculative tech heavy"
338
- return "balanced moderate risk growth and income diversified core equities and bonds"
339
-
340
- def _embed_scores(texts: List[str], query: str) -> np.ndarray:
341
- if _emb is None:
342
- return np.zeros(len(texts), dtype=float)
343
- qv = _emb.encode([query], normalize_embeddings=True)[0]
344
- M = _emb.encode(texts, normalize_embeddings=True)
345
- sims = (M @ qv).astype(float)
346
- return sims
347
-
348
- def make_suggestions(csv_path: str,
349
- universe: List[str],
350
- risk: str,
351
- use_embeddings: bool) -> List[Dict]:
352
- """
353
- Return a list of 3 suggestions. Each item:
354
- {"weights": {ticker: expo}, "er": float, "sigma": float, "beta": float, "row_text": str}
355
- """
356
- try:
357
- df = pd.read_csv(csv_path)
358
- except Exception:
359
- return []
360
 
361
- # Keep only rows that map nicely to current universe
362
- rows = []
363
- exps = []
364
- for _, r in df.iterrows():
365
- x = _row_to_exposures(r, universe)
366
- if x is None:
367
- continue
368
- rows.append(r)
369
- exps.append(x)
370
- if not rows:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  return []
372
-
373
- exps = np.vstack(exps)
374
- sigs = np.array([float(r["sigma_p"]) for r in rows])
375
- ers = np.array([float(r["er_p"]) for r in rows])
376
-
377
- # Choose a target sigma by risk quantile
378
- qmap = {"Low": 0.25, "Medium": 0.50, "High": 0.85}
379
- q = qmap.get(risk, 0.50)
380
- target_sigma = float(np.quantile(sigs, q=q))
381
-
382
- # Rank by closeness in sigma to target
383
- base_idx = np.argsort(np.abs(sigs - target_sigma))
384
-
385
- # Optional: light re-ranking using embeddings to prefer text that matches risk intent
386
- if use_embeddings:
387
- texts = [str(rows[i]["profile_text"]) for i in base_idx[:120]]
388
- sims = _embed_scores(texts, _risk_query_text(risk))
389
- # Blend: 80% sigma closeness (smaller better) and -20% similarity (larger better)
390
- closeness = np.abs(sigs[base_idx[:120]] - target_sigma)
391
- score = 0.8 * (closeness / (closeness.max() + 1e-9)) - 0.2 * sims
392
- rerank_local = np.argsort(score)
393
- idx = base_idx[:120][rerank_local]
394
- else:
395
- idx = base_idx
396
-
397
- # Take top 3 diverse by exposure distance
398
- picks, chosen = [], []
399
- for i in idx:
400
- wvec = exps[i]
401
- # enforce some diversity
402
- ok = True
403
- for j in chosen:
404
- if np.linalg.norm(wvec - exps[j]) < 0.25:
405
- ok = False
406
- break
407
- if not ok:
408
- continue
409
- chosen.append(i)
410
- r = rows[i]
411
- wmap = {universe[k]: float(wvec[k]) for k in range(len(universe)) if abs(wvec[k]) > 1e-4}
412
- picks.append({
413
- "weights": wmap,
414
- "er": float(r["er_p"]),
415
- "sigma": float(r["sigma_p"]),
416
- "beta": float(r["beta_p"]),
417
- "row_text": str(r["profile_text"])
418
- })
419
- if len(picks) == 3:
420
- break
421
  return picks
422
 
423
- # -------------- formatting helpers --------------
424
- def fmt_pct(x: float, dp: int = 2) -> str:
425
- return f"{x*100:.{dp}f}%"
426
-
427
- def build_summary_md(lookback, horizon, rf, rf_code, erp, sigma_mkt,
428
- beta_p, sigma_hist, mu_hist, mu_capm,
429
- a_sigma, b_sigma, mu_eff_sigma,
430
- a_mu, b_mu, sigma_eff_mu) -> str:
431
- lines = []
432
- lines.append("### Inputs")
433
- lines.append(f"- Lookback years **{lookback}**")
434
- lines.append(f"- Horizon years **{horizon}**")
435
- lines.append(f"- Risk-free **{fmt_pct(rf)}** from **{rf_code}**")
436
- lines.append(f"- Market ERP **{fmt_pct(erp)}**")
437
- lines.append(f"- Market σ **{fmt_pct(sigma_mkt)}**")
438
- lines.append("")
439
- lines.append("### Your portfolio (CAPM expectations)")
440
- lines.append(f"- Beta **{beta_p:.2f}**")
441
- lines.append(f"- σ (historical) **{fmt_pct(sigma_hist)}**")
442
- lines.append(f"- Expected return (historical) **{fmt_pct(mu_hist)}**")
443
- lines.append(f"- Expected return (CAPM / SML) **{fmt_pct(mu_capm)}**")
444
- lines.append("")
445
- lines.append("### Efficient alternatives on CML")
446
- lines.append(f"- Same σ as your portfolio → Market weight **{a_sigma:.2f}**, Bills weight **{b_sigma:.2f}**, return **{fmt_pct(mu_eff_sigma)}**")
447
- lines.append(f"- Same return (CAPM) → Market weight **{a_mu:.2f}**, Bills weight **{b_mu:.2f}**, σ **{fmt_pct(sigma_eff_mu)}**")
448
- return "\n".join(lines)
449
-
450
- # -------------- stateful globals on launch --------------
451
- ensure_data_dir()
452
- HORIZON_YEARS = 10
453
- RF_CODE = fred_series_for_horizon(HORIZON_YEARS)
454
- RF_ANN = fetch_fred_yield_annual(RF_CODE)
455
-
456
- # -------------- gradio callbacks --------------
457
  def search_tickers_cb(q: str):
458
  hits = yahoo_search(q)
459
  if not hits:
@@ -463,12 +405,10 @@ def search_tickers_cb(q: str):
463
 
464
  def add_symbol(selection: str, table: pd.DataFrame):
465
  if not selection:
466
- return table, "Pick a row from Matches first"
467
  symbol = selection.split("|")[0].strip().upper()
468
  current = [] if table is None or len(table) == 0 else [str(x).upper() for x in table["ticker"].tolist() if str(x) != "nan"]
469
  tickers = current if symbol in current else current + [symbol]
470
-
471
- # validate against yfinance (with market ticker alongside to force download structure)
472
  val = validate_tickers(tickers, years=DEFAULT_LOOKBACK_YEARS)
473
  tickers = [t for t in tickers if t in val]
474
  amt_map = {}
@@ -482,7 +422,7 @@ def add_symbol(selection: str, table: pd.DataFrame):
482
  if len(new_table) > MAX_TICKERS:
483
  new_table = new_table.iloc[:MAX_TICKERS]
484
  msg = f"Reached max of {MAX_TICKERS}"
485
- return new_table, msg
486
 
487
  def lock_ticker_column(tb: pd.DataFrame):
488
  if tb is None or len(tb) == 0:
@@ -499,197 +439,187 @@ def set_horizon(years: float):
499
  code = fred_series_for_horizon(y)
500
  rf = fetch_fred_yield_annual(code)
501
  global HORIZON_YEARS, RF_CODE, RF_ANN
502
- HORIZON_YEARS = int(round(y))
503
  RF_CODE = code
504
  RF_ANN = rf
505
- return f"Risk free series {code}. Latest annual rate {rf:.2%}. Using this for CAPM."
506
-
507
- def _build_dataset_path() -> str:
508
- return os.path.join(DATA_DIR, f"investor_profiles_{hex(random.getrandbits(32))[2:]}.csv")
509
-
510
- def compute(
511
- years_lookback: int,
512
- table: pd.DataFrame,
513
- risk_choice: str,
514
- use_embeddings: bool
515
- ):
516
- # --- sanitize input table ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
  if table is None or len(table) == 0:
518
- return None, "Add at least one ticker", "Universe empty", empty_positions_df(), gr.update(choices=[], value=None), empty_suggest_df(), None, {}
519
 
520
  df = table.dropna()
521
  df["ticker"] = df["ticker"].astype(str).str.upper().str.strip()
522
  df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce").fillna(0.0)
523
  symbols = [t for t in df["ticker"].tolist() if t]
524
- if len(symbols) == 0:
525
- return None, "Add at least one ticker", "Universe empty", empty_positions_df(), gr.update(choices=[], value=None), empty_suggest_df(), None, {}
526
 
527
  symbols = validate_tickers(symbols, years_lookback)
528
  if len(symbols) == 0:
529
- return None, "Could not validate any tickers", "Universe invalid", empty_positions_df(), gr.update(choices=[], value=None), empty_suggest_df(), None, {}
530
-
531
- universe = list(sorted(set([s for s in symbols if s != MARKET_TICKER] + [MARKET_TICKER])))[:MAX_TICKERS]
532
 
533
- df = df[df["ticker"].isin(symbols)].copy()
534
- amounts = {r["ticker"]: float(r["amount_usd"]) for _, r in df.iterrows()}
535
  gross = sum(abs(v) for v in amounts.values())
536
- if gross <= 1e-12:
537
- return None, "All amounts are zero", "Universe ok", empty_positions_df(), gr.update(choices=[], value=None), empty_suggest_df(), None, {}
 
538
 
539
- # --- CAPM ingredients ---
540
  rf_ann = RF_ANN
541
  moms = estimate_all_moments_aligned(symbols, years_lookback, rf_ann)
542
- betas, covA, erp_ann, sigma_mkt = moms["betas"], moms["cov_ann"], moms["erp_ann"], moms["sigma_m_ann"]
 
543
 
544
- # portfolio weights/exposures
545
- weights = {k: v / gross for k, v in amounts.items()}
546
- beta_p, mu_capm, sigma_p = portfolio_stats(weights, covA, betas, rf_ann, erp_ann)
547
 
548
- # historical mean (for info only)
549
- try:
550
- R = get_aligned_monthly_returns(symbols, years_lookback)
551
- mu_hist = float(annualize_mean(R[symbols].mean().dot(np.array([weights[s] for s in symbols]))))
552
- sigma_hist = sigma_p # same sigma as built from covA
553
- except Exception:
554
- mu_hist = mu_capm
555
- sigma_hist = sigma_p
556
-
557
- # efficient points on CML (use CAPM target)
558
- a_sigma, b_sigma, mu_eff_sigma = efficient_same_sigma(sigma_p, rf_ann, erp_ann, sigma_mkt)
559
- a_mu, b_mu, sigma_eff_mu = efficient_same_return(mu_capm, rf_ann, erp_ann, sigma_mkt)
560
-
561
- # --- Build dataset once for this run (universe-specific) ---
562
- ds_path = _build_dataset_path()
563
- synth_df = build_synthetic_dataset(
564
- universe=[u for u in universe if u != MARKET_TICKER],
565
- years=years_lookback,
566
- rf_ann=rf_ann,
567
- erp_ann=erp_ann,
568
- covA=covA,
569
- betas=betas
570
- )
571
- synth_df.to_csv(ds_path, index=False)
572
-
573
- # --- Suggestions (3 picks) ---
574
- picks = make_suggestions(ds_path, [u for u in universe if u != MARKET_TICKER], risk_choice, use_embeddings)
575
- if not picks:
576
- pick_choices = []
577
- sugg_table = empty_suggest_df()
578
- sugg_sigma = None
579
- sugg_mu = None
580
- else:
581
- pick_choices = [f"Pick #{i+1}" for i in range(len(picks))]
582
- # default selection = first pick
583
- first = picks[0]
584
- sugg_sigma = float(first["sigma"])
585
- sugg_mu = float(first["er"])
586
- sugg_table = _pick_table(first, amounts)
587
-
588
- # --- Plot with CAPM portfolio and suggestion point (if any) ---
589
- img = plot_cml(
590
- rf_ann, erp_ann, sigma_mkt,
591
- pt_sigma=sigma_p, pt_mu=mu_capm,
592
- same_sigma_sigma=sigma_p, same_sigma_mu=mu_eff_sigma,
593
- same_mu_sigma=sigma_eff_mu, same_mu_mu=mu_capm,
594
- sugg_sigma=sugg_sigma, sugg_mu=sugg_mu
595
- )
596
-
597
- # --- Summary text ---
598
- summary = build_summary_md(
599
- years_lookback, HORIZON_YEARS, rf_ann, RF_CODE, erp_ann, sigma_mkt,
600
- beta_p, sigma_hist, mu_hist, mu_capm,
601
- a_sigma, b_sigma, mu_eff_sigma,
602
- a_mu, b_mu, sigma_eff_mu
603
- )
604
-
605
- # positions table
606
  rows = []
607
  for t in symbols:
608
- beta_val = 1.0 if t == MARKET_TICKER else betas.get(t, np.nan)
609
  rows.append({
610
  "ticker": t,
611
  "amount_usd": amounts.get(t, 0.0),
612
- "weight_exposure": weights.get(t, 0.0),
613
- "beta": beta_val,
614
  })
615
  pos_table = pd.DataFrame(rows, columns=POS_COLS)
616
 
617
- uni_msg = f"Universe set to {', '.join(universe)}"
618
- # Return suggestions state so the picker can swap views
619
- suggestions_state = {"picks": picks, "amounts": amounts, "rf": rf_ann, "erp": erp_ann, "sigma_mkt": sigma_mkt, "mu_capm": mu_capm, "sigma_p": sigma_p}
620
- return img, summary, uni_msg, pos_table, gr.update(choices=pick_choices, value=(pick_choices[0] if pick_choices else None), interactive=bool(pick_choices)), sugg_table, ds_path, suggestions_state
621
-
622
- def _pick_table(pick: Dict, amounts_map: Dict[str, float]) -> pd.DataFrame:
623
- gross = float(sum(abs(v) for v in amounts_map.values()))
624
- wmap = pick["weights"]
625
- # normalize to exposures sum of abs = 1 for display
626
- gross_w = sum(abs(v) for v in wmap.values())
627
- if gross_w <= 1e-12:
628
- return empty_suggest_df()
629
- w_norm = {k: v / gross_w for k, v in wmap.items()}
630
- rows = []
631
- for t, w in sorted(w_norm.items(), key=lambda kv: -abs(kv[1])):
632
- rows.append({
633
- "ticker": t,
634
- "weight_%": 100.0 * float(w),
635
- "amount_$": float(w) * gross
636
- })
637
- df = pd.DataFrame(rows, columns=SUG_TABLE_COLS)
638
- return df
639
-
640
- def on_select_pick(choice: Optional[str], suggestions_state: Dict):
641
- if not choice or not suggestions_state or not suggestions_state.get("picks"):
642
- return empty_suggest_df(), gr.update(value=None)
643
- idx = int(choice.split("#")[1]) - 1
644
- idx = max(0, min(idx, len(suggestions_state["picks"]) - 1))
645
- pick = suggestions_state["picks"][idx]
646
- table = _pick_table(pick, suggestions_state["amounts"])
647
-
648
- # Update the plot with the chosen suggestion dot
649
- img = plot_cml(
650
- suggestions_state["rf"],
651
- suggestions_state["erp"],
652
- suggestions_state["sigma_mkt"],
653
- pt_sigma=suggestions_state["sigma_p"],
654
- pt_mu=suggestions_state["mu_capm"],
655
- same_sigma_sigma=suggestions_state["sigma_p"],
656
- same_sigma_mu=efficient_same_sigma(suggestions_state["sigma_p"], suggestions_state["rf"], suggestions_state["erp"], suggestions_state["sigma_mkt"])[2],
657
- same_mu_sigma=efficient_same_return(suggestions_state["mu_capm"], suggestions_state["rf"], suggestions_state["erp"], suggestions_state["sigma_mkt"])[2],
658
- same_mu_mu=suggestions_state["mu_capm"],
659
- sugg_sigma=float(pick["sigma"]),
660
- sugg_mu=float(pick["er"]),
661
  )
662
- return table, img
663
 
664
- # -------------- UI --------------
665
- with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
666
- with gr.Accordion("About (assignment section 1)", open=False):
667
- gr.Markdown(
668
- "**Modality:** Text\n\n"
669
- "**Model:** FinLang/finance-embeddings-investopedia (optional, for mild re-ranking of dataset suggestions).\n\n"
670
- "**Use case:** User enters tickers and dollar amounts; the app computes CAPM expectations and shows the "
671
- "Capital Market Line. From a synthetic dataset (1,000 portfolios generated over the user’s universe), "
672
- "the system returns 3 similar portfolios (Low/Medium/High risk picks). The user can flip between the "
673
- "suggested picks and see holdings in % and $ plus where the suggestion sits on the CML.\n"
674
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
675
 
 
676
  gr.Markdown(
677
  "## Efficient Portfolio Advisor\n"
678
- "Search symbols, enter dollar amounts, set your horizon. Prices from Yahoo Finance. Risk-free from FRED. "
679
- "Low/Medium/High suggestions are chosen only from a 1,000-row dataset generated from your current universe, "
680
- "optionally refined with finance embeddings."
681
  )
682
 
683
  with gr.Row():
684
  with gr.Column(scale=1):
685
  q = gr.Textbox(label="Search symbol")
686
  search_note = gr.Markdown()
687
- matches = gr.Dropdown(choices=[], label="Matches")
688
- with gr.Row():
689
- search_btn = gr.Button("Search")
690
- add_btn = gr.Button("Add selected to portfolio")
691
 
692
- gr.Markdown("### Portfolio positions type dollar amounts (negatives allowed for shorts)")
693
  table = gr.Dataframe(
694
  headers=["ticker", "amount_usd"],
695
  datatype=["str", "number"],
@@ -698,17 +628,13 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
698
  )
699
 
700
  horizon = gr.Number(label="Horizon in years (1–100)", value=HORIZON_YEARS, precision=0)
701
- lookback = gr.Slider(1, 10, value=DEFAULT_LOOKBACK_YEARS, step=1, label="Lookback years for beta & sigma")
702
-
703
- gr.Markdown("### Suggestions")
704
- risk = gr.Radio(["Low", "Medium", "High"], value="Medium", label="Risk tolerance")
705
- use_emb = gr.Checkbox(label="Use finance embeddings to refine picks", value=True)
706
 
707
- run_btn = gr.Button("Compute (build dataset & suggest)", variant="primary")
708
 
709
  with gr.Column(scale=1):
710
- plot = gr.Image(label="Capital Market Line (CML)", type="pil")
711
- summary = gr.Markdown(label="Inputs & CAPM expectations")
712
  universe_msg = gr.Textbox(label="Universe status", interactive=False)
713
 
714
  positions = gr.Dataframe(
@@ -720,43 +646,47 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
720
  interactive=False
721
  )
722
 
723
- with gr.Row():
724
- with gr.Column(scale=1):
725
- pick_select = gr.Radio(choices=[], label="Suggested pick (flip between #1 / #2 / #3)", interactive=False)
726
- with gr.Column(scale=1):
727
- sugg_table = gr.Dataframe(
728
- label="Suggestion holdings — % and $",
729
- headers=SUG_TABLE_COLS,
 
730
  datatype=["str", "number", "number"],
731
- col_count=(len(SUG_TABLE_COLS), "fixed"),
732
  value=empty_suggest_df(),
733
  interactive=False
734
  )
735
- dl = gr.File(label="Generated dataset CSV", value=None, visible=True)
736
 
737
- # hidden state for suggestions
738
- suggestions_state = gr.State({})
739
-
740
- # wire events
741
  def do_search(query):
742
  note, options = search_tickers_cb(query)
743
- return note, gr.update(choices=options)
 
744
 
745
  search_btn.click(fn=do_search, inputs=q, outputs=[search_note, matches])
746
- add_btn.click(fn=add_symbol, inputs=[matches, table], outputs=[table, search_note])
747
  table.change(fn=lock_ticker_column, inputs=table, outputs=table)
748
  horizon.change(fn=set_horizon, inputs=horizon, outputs=universe_msg)
749
 
750
  run_btn.click(
751
  fn=compute,
752
- inputs=[lookback, table, risk, use_emb],
753
- outputs=[plot, summary, universe_msg, positions, pick_select, sugg_table, dl, suggestions_state]
754
  )
755
 
756
- pick_select.change(
757
- fn=on_select_pick,
758
- inputs=[pick_select, suggestions_state],
759
- outputs=[sugg_table, plot]
 
 
 
 
 
 
760
  )
761
 
762
  if __name__ == "__main__":
 
1
+
2
+ import os, io, math, json, warnings
3
  warnings.filterwarnings("ignore")
4
 
5
  from typing import List, Tuple, Dict, Optional
 
12
  import yfinance as yf
13
  import gradio as gr
14
 
15
+ from sentence_transformers import SentenceTransformer
 
 
 
 
 
 
16
 
17
+ # ==============================
18
+ # Config
19
+ # ==============================
20
# Directory that holds generated artifacts, and the CSV written inside it.
DATA_DIR = "data"
DATASET_PATH = os.path.join(DATA_DIR, "investor_profiles.csv")

# Portfolio size cap and default history window (in years).
MAX_TICKERS = 30
DEFAULT_LOOKBACK_YEARS = 5

# Market proxies, listed in order of preference.
MARKET_CANDIDATES = ["VOO", "SPY", "IVV"]

# Column layouts of the Gradio tables.
POS_COLS = ["ticker", "amount_usd", "weight_exposure", "beta"]
SUG_COLS = ["ticker", "weight_pct", "amount_usd"]

# Module-level state; these values are reassigned while the app runs.
HORIZON_YEARS = 5.0
RF_CODE = "DGS5"
RF_ANN = 0.03

# Embedding model handle; None until a model has been loaded.
_EMB_MODEL = None
40
+
41
+ # ==============================
42
+ # Small utils
43
+ # ==============================
44
def ensure_data_dir():
    """Create the DATA_DIR directory; do nothing if it already exists."""
    os.makedirs(name=DATA_DIR, exist_ok=True)
46
+
47
def fmt_pct(x: float) -> str:
    """Format a fraction as a percentage string with two decimals.

    Anything that cannot be converted to float is rendered as "0.00%".
    """
    try:
        fraction = float(x)
    except Exception:
        return "0.00%"
    return f"{fraction * 100:.2f}%"
52
+
53
def _pct(x):
    """Scale a fraction (scalar or array-like) to percent units."""
    as_float = np.asarray(x, dtype=float)
    return as_float * 100.0
56
 
57
def empty_positions_df():
    """Return a zero-row DataFrame whose columns are POS_COLS."""
    blank = pd.DataFrame(columns=POS_COLS)
    return blank
59
+
60
def empty_suggest_df():
    """Return a zero-row DataFrame whose columns are SUG_COLS."""
    blank = pd.DataFrame(columns=SUG_COLS)
    return blank
62
+
63
+ # ==============================
64
+ # Risk-free via FRED
65
+ # ==============================
66
  FRED_MAP = [
67
  (1, "DGS1"),
68
  (2, "DGS2"),
 
75
  (100, "DGS30"),
76
  ]
77
 
 
 
 
 
 
 
 
 
 
 
78
  def fred_series_for_horizon(years: float) -> str:
79
  y = max(1.0, min(100.0, float(years)))
80
  for cutoff, code in FRED_MAP:
 
93
  except Exception:
94
  return 0.03
95
 
96
+ # ==============================
97
+ # Prices & returns (robust to yfinance shapes)
98
+ # ==============================
99
def _extract_close(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce a price download to a frame of closing prices.

    A Series is converted to a one-column frame. With MultiIndex columns the
    "Close" slice of level 0 is returned, then "Adj Close" if that fails, and
    finally the slice for whichever level-0 label appears first. With flat
    columns the "Close" column is returned, or "Adj Close" renamed to
    "Close"; a frame that has neither is returned unchanged.
    """
    if isinstance(df, pd.Series):
        return df.to_frame()

    columns = df.columns
    if not isinstance(columns, pd.MultiIndex):
        if "Close" in columns:
            return df[["Close"]]
        if "Adj Close" not in columns:
            return df
        adjusted = df[["Adj Close"]].copy()
        adjusted.columns = ["Close"]
        return adjusted

    for field in ("Close", "Adj Close"):
        try:
            return df.xs(field, axis=1, level=0)
        except Exception:
            continue

    # Neither price field is present: slice on the first level-0 label.
    first_level_labels = list(dict.fromkeys(columns.get_level_values(0)))
    return df.xs(first_level_labels[0], axis=1, level=0)
 
 
 
 
 
 
 
 
 
 
 
120
 
121
def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
    """Download monthly, auto-adjusted prices for the given tickers.

    Args:
        tickers: Symbols to download. Falsy entries are dropped and
            duplicates removed while keeping first-seen order.
        years: Length of the history window; the start date is `years`
            plus seven days before today (UTC).

    Returns:
        A DataFrame of prices with all-NaN rows removed and remaining gaps
        forward-filled. An empty DataFrame is returned when no usable ticker
        is supplied.
    """
    tickers = list(dict.fromkeys(t for t in tickers if t))  # unique, keep order
    if not tickers:
        return pd.DataFrame()

    now = pd.Timestamp.today(tz="UTC")
    start = (now - pd.DateOffset(years=years, days=7)).date()
    end = now.date()
    df = yf.download(
        tickers,
        start=start,
        end=end,
        interval="1mo",
        auto_adjust=True,
        progress=False,
        group_by="column",
    )
    # _extract_close handles both frames and series, so route both through it;
    # a bare Series would otherwise fail on the shape[1] check below.
    if isinstance(df, (pd.DataFrame, pd.Series)):
        df = _extract_close(df)

    # DataFrame.ffill() is the supported spelling; fillna(method="ffill") is
    # deprecated in recent pandas releases.
    df = df.dropna(how="all").ffill()

    # A single requested ticker may come back under a generic price label;
    # rename that lone column to the ticker itself.
    if (
        df.shape[1] == 1
        and len(tickers) == 1
        and df.columns[0] in ("Close", "Adj Close")
    ):
        df.columns = [tickers[0]]
    return df
147
 
148
def monthly_returns(prices: pd.DataFrame) -> pd.DataFrame:
    """Return period-over-period percentage changes, dropping all-NaN rows."""
    changes = prices.pct_change()
    return changes.dropna(how="all")
150
+
151
+ # ==============================
152
+ # Aligned moments (market chosen dynamically)
153
+ # ==============================
154
def get_aligned_monthly_returns(symbols: List[str], years: int) -> Tuple[pd.DataFrame, str]:
    """Fetch monthly returns for the symbols plus one market proxy.

    The market proxy is the first entry of MARKET_CANDIDATES that has a
    column in the downloaded returns. Rows with any missing value are
    dropped and duplicate columns removed, so every kept column covers the
    same dates.

    Returns:
        (returns frame, market symbol).

    Raises:
        ValueError: if none of the market candidates came back with data.
    """
    requested = list(dict.fromkeys(symbols))
    download_list = list(dict.fromkeys(requested + MARKET_CANDIDATES))
    rets = monthly_returns(fetch_prices_monthly(download_list, years))

    # First available market proxy, in preference order.
    market = next((m for m in MARKET_CANDIDATES if m in rets.columns), None)
    if market is None:
        raise ValueError("No market proxy (VOO/SPY/IVV) found in returned data.")

    keep = [sym for sym in requested if sym in rets.columns]
    keep.append(market)
    aligned = rets[keep].dropna(how="any")
    # The market may also be one of the requested symbols; keep one copy.
    aligned = aligned.loc[:, ~aligned.columns.duplicated()]
    return aligned, market
171
 
172
  def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
173
+ R, market = get_aligned_monthly_returns(symbols, years)
174
+ if market not in R.columns or R.shape[0] < 3:
175
+ raise ValueError("Not enough aligned data.")
176
  rf_m = rf_ann / 12.0
177
 
178
+ m = R[market]
179
  if isinstance(m, pd.DataFrame):
180
  m = m.iloc[:, 0].squeeze()
181
 
182
+ mu_m_ann = float(m.mean() * 12.0)
183
+ sigma_m_ann = float(m.std(ddof=1) * math.sqrt(12.0))
184
  erp_ann = float(mu_m_ann - rf_ann)
185
 
186
  ex_m = m - rf_m
 
188
  var_m = max(var_m, 1e-8)
189
 
190
  betas: Dict[str, float] = {}
191
+ for s in [c for c in R.columns if c != market]:
192
  ex_s = R[s] - rf_m
193
+ b = float(np.cov(ex_s.values, ex_m.values, ddof=1)[0, 1] / var_m)
194
+ betas[s] = b
195
+ betas[market] = 1.0
196
+
197
+ asset_cols = [c for c in R.columns if c != market]
198
+ cov_m = np.cov(R[asset_cols].values.T, ddof=1) if asset_cols else np.zeros((0, 0))
199
+ covA = pd.DataFrame(cov_m * 12.0, index=asset_cols, columns=asset_cols)
200
+
201
+ return {
202
+ "betas": betas,
203
+ "cov_ann": covA,
204
+ "erp_ann": erp_ann,
205
+ "sigma_m_ann": sigma_m_ann,
206
+ "market": market,
207
+ }
208
+
209
+ # ==============================
210
+ # Portfolio stats (CAPM)
211
+ # ==============================
212
def capm_er(beta: float, rf_ann: float, erp_ann: float) -> float:
    """Return the CAPM expected return: rf_ann + beta * erp_ann."""
    risk_premium = beta * erp_ann
    return float(rf_ann + risk_premium)
214
 
 
218
  rf_ann: float,
219
  erp_ann: float) -> Tuple[float, float, float]:
220
  tickers = list(weights.keys())
221
+ if not tickers:
222
+ return 0.0, rf_ann, 0.0
223
  w = np.array([weights[t] for t in tickers], dtype=float)
224
  gross = float(np.sum(np.abs(w)))
225
  if gross == 0:
226
+ return 0.0, rf_ann, 0.0
227
  w_expo = w / gross
228
  beta_p = float(np.dot([betas.get(t, 0.0) for t in tickers], w_expo))
229
+ er_capm = capm_er(beta_p, rf_ann, erp_ann)
230
  cov = cov_ann.reindex(index=tickers, columns=tickers).fillna(0.0).to_numpy()
231
  sigma_p = math.sqrt(float(max(w_expo.T @ cov @ w_expo, 0.0)))
232
+ return beta_p, er_capm, sigma_p
233
+
234
+ # ==============================
235
+ # Plot CML with CAPM point
236
+ # ==============================
237
def plot_cml(rf_ann: float, erp_ann: float, sigma_mkt: float,
             user_beta: float,
             suggestion: Optional[Dict] = None) -> Image.Image:
    """Draw the Capital Market Line and return the chart as a PIL image.

    The chart shows the CML through the market point, the risk-free and
    market points, the user's CAPM point placed at sigma = |beta| * sigma_mkt,
    and, when `suggestion` is given, a diamond at its "sigma" / "er" values.
    Both axes are in percent.
    """
    figure = plt.figure(figsize=(6.4, 4.2), dpi=120)

    cml_slope = erp_ann / max(sigma_mkt, 1e-12)
    sigma_limit = max(0.3, 2.0 * sigma_mkt)
    sigma_grid = np.linspace(0.0, sigma_limit, 180)
    cml_values = rf_ann + cml_slope * sigma_grid
    plt.plot(_pct(sigma_grid), _pct(cml_values), label="CML via Market", linewidth=1.8)

    # Anchor points: risk-free asset and the market portfolio.
    plt.scatter([_pct(0.0)], [_pct(rf_ann)], label="Risk-free", s=25)
    plt.scatter([_pct(sigma_mkt)], [_pct(rf_ann + erp_ann)], label="Market", s=25)

    # The user's point uses sigma = |beta| * sigma_mkt so it sits on the CML.
    user_sigma = abs(user_beta) * sigma_mkt
    user_return = capm_er(user_beta, rf_ann, erp_ann)
    plt.scatter([_pct(user_sigma)], [_pct(user_return)], label="Your CAPM point", s=35)

    if suggestion is not None:
        suggestion_x = _pct(float(suggestion["sigma"]))
        suggestion_y = _pct(float(suggestion["er"]))
        plt.scatter([suggestion_x], [suggestion_y],
                    label="Selected Suggestion", marker="D", s=35)

    plt.xlabel("σ (annual, %)")
    plt.ylabel("Expected return (annual, %)")
    plt.legend(loc="best", fontsize=8)
    plt.tight_layout()

    # Render to an in-memory PNG and hand it back as a PIL image.
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format="png")
    plt.close(figure)
    png_buffer.seek(0)
    return Image.open(png_buffer)
271
 
272
+ # ==============================
273
+ # Yahoo symbol search
274
+ # ==============================
275
def yahoo_search(query: str):
    """Look up symbols matching `query` via the Yahoo Finance search endpoint.

    Returns a list of at most ten dicts with "symbol", "name" and "exchange"
    keys. An empty or whitespace-only query yields an empty list. If the
    request fails or produces no usable quotes, a single entry echoing the
    typed text (upper-cased) is returned instead.
    """
    if not query or not query.strip():
        return []

    term = query.strip()

    def typed_symbol():
        # Fallback entry: treat whatever was typed as the symbol.
        return [{"symbol": term.upper(), "name": "typed symbol", "exchange": "n/a"}]

    try:
        response = requests.get(
            "https://query1.finance.yahoo.com/v1/finance/search",
            params={"q": term, "quotesCount": 10, "newsCount": 0},
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10,
        )
        response.raise_for_status()
        hits = []
        for quote in response.json().get("quotes", []):
            symbol = quote.get("symbol")
            label = quote.get("shortname") or quote.get("longname") or ""
            exchange = quote.get("exchDisp") or ""
            # Keep only entries that have an ASCII symbol.
            if symbol and symbol.isascii():
                hits.append({"symbol": symbol, "name": label, "exchange": exchange})
        return hits[:10] if hits else typed_symbol()
    except Exception:
        return typed_symbol()
297
 
298
def validate_tickers(symbols: List[str], years: int) -> List[str]:
    """Return the symbols for which price data could be downloaded.

    Falsy entries and duplicates are removed first (first-seen order kept).
    The symbols are downloaded together with MARKET_CANDIDATES, and a symbol
    is kept when it appears as a column of the price frame.
    """
    candidates = list(dict.fromkeys(s for s in symbols if s))
    prices = fetch_prices_monthly(candidates + MARKET_CANDIDATES, years)
    available = prices.columns
    return [sym for sym in candidates if sym in available]
 
 
 
 
 
 
303
 
304
+ # ==============================
305
+ # Synthetic dataset & suggestions
306
+ # ==============================
307
def synth_profile_text(beta: float, sigma: float, er: float, weights: Dict[str, float]) -> str:
    """Describe a portfolio in one sentence.

    The text lists beta, volatility and expected return, followed by up to
    eight holdings ordered by descending absolute weight and shown as
    absolute percentages.
    """
    by_size = sorted(weights.items(), key=lambda item: -abs(item[1]))
    holdings = ", ".join(
        f"{name} {abs(share)*100:.1f}%" for name, share in by_size[:8]
    )
    headline = (
        f"portfolio with beta {beta:.2f}, volatility {sigma:.3f}, "
        f"expected return {er:.3f}; "
    )
    return headline + "holdings: " + holdings
314
 
315
def build_synthetic_dataset(universe: List[str],
                            rf_ann: float,
                            erp_ann: float,
                            betas: Dict[str, float],
                            covA: pd.DataFrame,
                            n_rows: int = 1000,
                            seed: int = 123) -> pd.DataFrame:
    """Generate random long-only portfolios drawn from `universe`.

    Each row picks a random subset of the universe without replacement, gives
    it Dirichlet-distributed weights, and records the statistics returned by
    portfolio_stats along with a text description.

    Returns:
        A DataFrame with columns "tickers" and "weights" (comma-separated
        strings), "beta", "er", "sigma" and "desc".
    """
    rng = np.random.default_rng(seed)
    assets = list(universe)  # long-only samples
    n_assets = len(assets)
    # Bounds for the random subset size; the draw is capped at n_assets below.
    size_low = max(2, min(2, n_assets))
    size_high = max(3, min(8, n_assets)) + 1

    records = []
    for _ in range(n_rows):
        k = rng.integers(low=size_low, high=size_high)
        picks = list(rng.choice(assets, size=min(k, n_assets), replace=False))
        raw = rng.dirichlet(np.ones(len(picks)))
        wmap = {ticker: float(share) for ticker, share in zip(picks, raw)}
        beta_p, er_capm, sigma_p = portfolio_stats(wmap, covA, betas, rf_ann, erp_ann)
        records.append({
            "tickers": ",".join(picks),
            "weights": ",".join(f"{wmap[t]:.6f}" for t in picks),
            "beta": beta_p,
            "er": er_capm,
            "sigma": sigma_p,
            "desc": synth_profile_text(beta_p, sigma_p, er_capm, wmap),
        })
    return pd.DataFrame(records)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
def get_embedding_model():
    """Return the shared SentenceTransformer, loading it on first call."""
    global _EMB_MODEL
    if _EMB_MODEL is not None:
        return _EMB_MODEL
    _EMB_MODEL = SentenceTransformer("FinLang/finance-embeddings-investopedia")
    return _EMB_MODEL
347
+
348
def encode_texts(texts: List[str]):
    """Encode the texts with the shared embedding model, requesting normalized embeddings."""
    return get_embedding_model().encode(texts, normalize_embeddings=True)
351
+
352
def cosine_sim(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Return the matrix product of `a` with the transpose of `b`.

    This equals cosine similarity only when the rows of both inputs are
    already unit-length; no normalization is done here.
    """
    b_transposed = b.T
    return a @ b_transposed
354
+
355
def select_bucket_candidates(df: pd.DataFrame, bucket: str) -> pd.DataFrame:
    """Return the rows of `df` whose "sigma" falls in the requested tertile.

    "Low" keeps sigma at or below the 1/3 quantile, "Medium" keeps sigma above
    it and at or below the 2/3 quantile, and any other bucket value keeps
    sigma above the 2/3 quantile.
    """
    sigma = df["sigma"]
    lower_cut = sigma.quantile(1/3)
    upper_cut = sigma.quantile(2/3)

    if bucket == "Low":
        mask = sigma <= lower_cut
    elif bucket == "Medium":
        mask = (sigma > lower_cut) & (sigma <= upper_cut)
    else:
        mask = sigma > upper_cut
    return df[mask]
364
+
365
def parse_weights(row: pd.Series) -> Dict[str, float]:
    """Rebuild a ticker -> weight mapping from a dataset row.

    Reads the comma-separated "tickers" and "weights" fields, pairs them up
    (extra entries on the longer side are ignored) and rescales the weights
    so their absolute values sum to one. If that sum is zero the weights are
    returned unscaled.
    """
    names = [part.strip() for part in str(row["tickers"]).split(",")]
    values = [float(part) for part in str(row["weights"]).split(",")]
    paired = dict(zip(names, values))

    # Normalize defensively; fall back to 1.0 to avoid dividing by zero.
    total = sum(abs(value) for value in paired.values()) or 1.0
    return {name: value / total for name, value in paired.items()}
372
+
373
def pick_top3_for_bucket(df: pd.DataFrame, bucket: str) -> List[Dict]:
    """Return up to three portfolios from the bucket, ranked by embedding similarity.

    Rows of the requested sigma bucket are scored by the similarity between
    their "desc" text and a fixed query sentence for that bucket; the three
    highest-scoring rows are returned as dicts with "weights", "beta", "er"
    and "sigma". An empty bucket yields an empty list.
    """
    candidates = select_bucket_candidates(df, bucket)
    if candidates.empty:
        return []

    # One short query sentence per risk bucket.
    bucket_queries = {
        "Low": "low risk, stable portfolio, conservative volatility",
        "Medium": "balanced risk portfolio, moderate volatility",
        "High": "high risk, growth portfolio, higher volatility"
    }
    query_text = bucket_queries[bucket]

    candidate_vectors = encode_texts(candidates["desc"].tolist())
    query_vector = encode_texts([query_text])[0].reshape(1, -1)
    scores = cosine_sim(query_vector, candidate_vectors).flatten()
    ranking = np.argsort(-scores)  # highest similarity first

    def as_pick(position) -> Dict:
        record = candidates.iloc[int(position)]
        return {"weights": parse_weights(record), "beta": float(record["beta"]),
                "er": float(record["er"]), "sigma": float(record["sigma"])}

    return [as_pick(position) for position in ranking[:3]]
395
 
396
+ # ==============================
397
+ # Gradio callbacks
398
+ # ==============================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  def search_tickers_cb(q: str):
400
  hits = yahoo_search(q)
401
  if not hits:
 
405
 
406
  def add_symbol(selection: str, table: pd.DataFrame):
407
  if not selection:
408
+ return table, "Pick a row from Matches first", gr.update(value=None)
409
  symbol = selection.split("|")[0].strip().upper()
410
  current = [] if table is None or len(table) == 0 else [str(x).upper() for x in table["ticker"].tolist() if str(x) != "nan"]
411
  tickers = current if symbol in current else current + [symbol]
 
 
412
  val = validate_tickers(tickers, years=DEFAULT_LOOKBACK_YEARS)
413
  tickers = [t for t in tickers if t in val]
414
  amt_map = {}
 
422
  if len(new_table) > MAX_TICKERS:
423
  new_table = new_table.iloc[:MAX_TICKERS]
424
  msg = f"Reached max of {MAX_TICKERS}"
425
+ return new_table, msg, gr.update(value=None) # also clears dropdown
426
 
427
  def lock_ticker_column(tb: pd.DataFrame):
428
  if tb is None or len(tb) == 0:
 
439
  code = fred_series_for_horizon(y)
440
  rf = fetch_fred_yield_annual(code)
441
  global HORIZON_YEARS, RF_CODE, RF_ANN
442
+ HORIZON_YEARS = y
443
  RF_CODE = code
444
  RF_ANN = rf
445
+ return f"Risk-free series {code}. Latest annual rate {rf:.2%}."
446
+
447
def build_summary_md(lookback, rf_code, rf, erp, sigma_mkt,
                     beta_p, er_capm, sigma_cml_user,
                     market_sym) -> str:
    """Render the inputs and the user's CAPM figures as a Markdown block.

    The horizon line is taken from the module-level HORIZON_YEARS value,
    rounded to a whole number of years.
    """
    sections = [
        "### Inputs",
        f"- Lookback years {lookback}",
        f"- Horizon years {int(round(HORIZON_YEARS))}",
        f"- Risk-free {fmt_pct(rf)} from {rf_code}",
        f"- Market ERP {fmt_pct(erp)}",
        f"- Market σ {fmt_pct(sigma_mkt)} (proxy: {market_sym})",
        "",
        "### Your portfolio (CAPM)",
        f"- Beta {beta_p:.2f}",
        f"- Expected return (CAPM / SML) {fmt_pct(er_capm)}",
        f"- σ on CML for your beta (|β|×σ_mkt) {fmt_pct(sigma_cml_user)}",
    ]
    return "\n".join(sections)
463
+
464
def pack_suggestion_table(pick: Dict, gross_usd: float) -> pd.DataFrame:
    """Turn a suggestion's weights into a holdings table.

    Rows are ordered by descending weight. Each row carries the ticker, the
    weight in percent, and the dollar amount obtained by multiplying the
    weight by `gross_usd`. Columns follow SUG_COLS.
    """
    ordered = sorted(pick["weights"].items(), key=lambda kv: -kv[1])
    holdings = [
        {
            "ticker": ticker,
            "weight_pct": float(weight) * 100.0,
            "amount_usd": float(weight) * float(gross_usd),
        }
        for ticker, weight in ordered
    ]
    return pd.DataFrame(holdings, columns=SUG_COLS)
473
+
474
def suggestion_metrics_md(pick: Dict) -> str:
    """Render a suggestion's expected return, sigma and beta as Markdown."""
    expected_return = fmt_pct(pick['er'])
    volatility = fmt_pct(pick['sigma'])
    beta_text = f"{pick['beta']:.2f}"
    lines = [
        "**Suggested portfolio**",
        f"- Expected return (CAPM) {expected_return}",
        f"- σ (annual) {volatility}",
        f"- Beta {beta_text}",
    ]
    # Lines are separated by a space followed by a newline.
    return " \n".join(lines)
481
+
482
def _compute_failure(summary_msg: str, universe_msg: str):
    """Build the nine-output tuple compute() returns when it cannot proceed."""
    return (None, summary_msg, universe_msg, empty_positions_df(), {},
            gr.update(), gr.update(), "", empty_suggest_df())


def compute(years_lookback: int,
            table: pd.DataFrame,
            risk_choice: str,
            pick_choice: str):
    """Run the full analysis for the portfolio typed into the table.

    Args:
        years_lookback: History window in years, used for validation and for
            estimating betas and covariances.
        table: Frame with "ticker" and "amount_usd" columns.
        risk_choice: "Low", "Medium" or "High"; anything else means "Medium".
        pick_choice: "Pick #1", "Pick #2" or "Pick #3"; anything else means
            the first pick.

    Returns:
        A 9-tuple in the order: plot image, summary Markdown, universe status
        text, positions table, suggestions state dict, risk radio update,
        pick radio update, suggestion metrics Markdown, suggestion holdings
        table. On failure the image is None, the state is an empty dict and
        the summary carries the reason.
    """
    # ---------- sanitize input table ----------
    if table is None or len(table) == 0:
        return _compute_failure("Add at least one ticker.", "Universe empty")

    # Copy so the column assignments below never write into a view.
    df = table.dropna().copy()
    df["ticker"] = df["ticker"].astype(str).str.upper().str.strip()
    df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce").fillna(0.0)
    symbols = [t for t in df["ticker"].tolist() if t]

    symbols = validate_tickers(symbols, years_lookback)
    if not symbols:
        return _compute_failure("Could not validate any tickers.", "Universe invalid")

    # ---------- amounts & weights ----------
    valid = set(symbols)
    amounts = {
        ticker: float(amount)
        for ticker, amount in zip(df["ticker"], df["amount_usd"])
        if ticker in valid
    }
    gross = sum(abs(v) for v in amounts.values())
    if gross == 0:
        return _compute_failure("All amounts are zero.", "Universe ok")
    # Weights are shares of gross exposure, so shorts keep their sign.
    weights_user = {k: v / gross for k, v in amounts.items()}

    # ---------- risk-free & moments ----------
    rf_ann = RF_ANN
    try:
        moms = estimate_all_moments_aligned(symbols, years_lookback, rf_ann)
    except ValueError as err:
        # Surface data problems as a message, like the other failure paths,
        # instead of letting the exception escape to the UI.
        return _compute_failure(f"Could not estimate moments: {err}", "Universe ok")
    betas, covA = moms["betas"], moms["cov_ann"]
    erp_ann, sigma_mkt, market_sym = moms["erp_ann"], moms["sigma_m_ann"], moms["market"]

    # ---------- user stats (CAPM) ----------
    beta_p, er_capm, _sigma_hist = portfolio_stats(weights_user, covA, betas, rf_ann, erp_ann)
    sigma_user_on_cml = abs(beta_p) * sigma_mkt  # plotted, ensures point on CML

    # ---------- positions table ----------
    rows = [
        {
            "ticker": t,
            "amount_usd": amounts.get(t, 0.0),
            "weight_exposure": weights_user.get(t, 0.0),
            "beta": 1.0 if abs(betas.get(t, 0.0) - 1.0) < 1e-9 else betas.get(t, np.nan),
        }
        for t in symbols
    ]
    pos_table = pd.DataFrame(rows, columns=POS_COLS)

    # ---------- synthetic dataset ----------
    ensure_data_dir()
    synth_df = build_synthetic_dataset(
        universe=sorted(set(symbols)),
        rf_ann=rf_ann,
        erp_ann=erp_ann,
        betas=betas,
        covA=covA,
        n_rows=1000,
        seed=123,
    )
    try:
        synth_df.to_csv(DATASET_PATH, index=False)
    except Exception:
        # Best effort: the CSV is a by-product and must not block the result.
        pass

    # ---------- pick 3 per bucket using embeddings ----------
    low3 = pick_top3_for_bucket(synth_df, "Low")
    med3 = pick_top3_for_bucket(synth_df, "Medium")
    high3 = pick_top3_for_bucket(synth_df, "High")

    # ---------- build state ----------
    state = {
        "gross": float(gross),
        "picks": {"Low": low3, "Medium": med3, "High": high3},
        "rf": float(rf_ann),
        "erp": float(erp_ann),
        "sigma_mkt": float(sigma_mkt),
        "user_beta": float(beta_p),
    }

    # ---------- decide which suggestion to show initially ----------
    pick_labels = ["Pick #1", "Pick #2", "Pick #3"]
    risk = risk_choice if risk_choice in ("Low", "Medium", "High") else "Medium"
    pick_idx = pick_labels.index(pick_choice) if pick_choice in pick_labels else 0
    picks_list = state["picks"].get(risk, [])
    if pick_idx >= len(picks_list):
        pick_idx = 0  # requested pick does not exist; fall back to the first
    pick = picks_list[pick_idx] if picks_list else None

    # ---------- plot ----------
    img = plot_cml(rf_ann, erp_ann, sigma_mkt, beta_p, suggestion=pick)

    # ---------- summary ----------
    info = build_summary_md(
        years_lookback, RF_CODE, rf_ann, erp_ann, sigma_mkt,
        beta_p, er_capm, sigma_user_on_cml, market_sym
    )

    # ---------- suggestion UI ----------
    risk_update = gr.update(choices=["Low", "Medium", "High"], value=risk)
    # Keep the radio on the pick that is actually displayed; resetting it to
    # "Pick #1" would leave the selector out of sync with the plot and table.
    pick_update = gr.update(choices=pick_labels, value=pick_labels[pick_idx])
    universe_text = f"Universe set to {', '.join(sorted(symbols))}"

    if pick is None:
        return (img, info, universe_text, pos_table, state, risk_update,
                pick_update, "No suggestions available.", empty_suggest_df())

    sug_md = suggestion_metrics_md(pick)
    sug_table = pack_suggestion_table(pick, gross)

    return (img, info, universe_text, pos_table, state, risk_update,
            pick_update, sug_md, sug_table)
584
+
585
def update_suggestion(risk: str, pick_name: str, state: dict):
    """Redraw the suggestion view from the stored state.

    Returns (plot, metrics Markdown, holdings table). When the state has no
    picks, or the chosen bucket is empty, the plot slot is a no-op update and
    the table is empty. A pick index beyond the bucket's length is clamped to
    the last available pick.
    """
    if not state or "picks" not in state:
        return gr.update(), "", empty_suggest_df()

    bucket_picks = state["picks"].get(risk, [])
    if not bucket_picks:
        return gr.update(), "No suggestions for this bucket.", empty_suggest_df()

    labels = ["Pick #1", "Pick #2", "Pick #3"]
    position = labels.index(pick_name) if pick_name in labels else 0
    chosen = bucket_picks[min(position, len(bucket_picks) - 1)]

    chart = plot_cml(state["rf"], state["erp"], state["sigma_mkt"],
                     state["user_beta"], suggestion=chosen)
    metrics_md = suggestion_metrics_md(chosen)
    holdings = pack_suggestion_table(chosen, state.get("gross", 0.0))
    return chart, metrics_md, holdings
598
+
599
+ # ==============================
600
+ # Build UI
601
+ # ==============================
602
+ ensure_data_dir()
603
+ RF_CODE = fred_series_for_horizon(HORIZON_YEARS)
604
+ RF_ANN = fetch_fred_yield_annual(RF_CODE)
605
 
606
+ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
607
  gr.Markdown(
608
  "## Efficient Portfolio Advisor\n"
609
+ "Search symbols, enter **dollar amounts**, set horizon. "
610
+ "Returns are from Yahoo Finance (monthly). Risk-free is from FRED. "
611
+ "Plot shows **CAPM point on the CML** (no historical returns plotted)."
612
  )
613
 
614
  with gr.Row():
615
  with gr.Column(scale=1):
616
  q = gr.Textbox(label="Search symbol")
617
  search_note = gr.Markdown()
618
+ matches = gr.Dropdown(choices=[], label="Matches", allow_custom_value=True)
619
+ search_btn = gr.Button("Search")
620
+ add_btn = gr.Button("Add selected to portfolio")
 
621
 
622
+ gr.Markdown("### Portfolio positions (enter $ amounts; negatives allowed for shorts)")
623
  table = gr.Dataframe(
624
  headers=["ticker", "amount_usd"],
625
  datatype=["str", "number"],
 
628
  )
629
 
630
  horizon = gr.Number(label="Horizon in years (1–100)", value=HORIZON_YEARS, precision=0)
631
+ lookback = gr.Slider(1, 10, value=DEFAULT_LOOKBACK_YEARS, step=1, label="Lookback years for betas & covariances")
 
 
 
 
632
 
633
+ run_btn = gr.Button("Compute")
634
 
635
  with gr.Column(scale=1):
636
+ plot = gr.Image(label="Capital Market Line (CAPM)", type="pil")
637
+ summary = gr.Markdown(label="Summary")
638
  universe_msg = gr.Textbox(label="Universe status", interactive=False)
639
 
640
  positions = gr.Dataframe(
 
646
  interactive=False
647
  )
648
 
649
+ gr.Markdown("### Dataset-based suggestions (choose risk bucket and pick)")
650
+ state = gr.State({})
651
+ risk_selector = gr.Radio(choices=["Low", "Medium", "High"], value="Medium", label="Risk bucket to view")
652
+ pick_selector = gr.Radio(choices=["Pick #1", "Pick #2", "Pick #3"], value="Pick #1", label="Suggestion")
653
+ sugg_metrics = gr.Markdown(label="Suggestion metrics")
654
+ suggestions = gr.Dataframe(
655
+ label="Suggested holdings",
656
+ headers=SUG_COLS,
657
  datatype=["str", "number", "number"],
658
+ col_count=(len(SUG_COLS), "fixed"),
659
  value=empty_suggest_df(),
660
  interactive=False
661
  )
 
662
 
663
+ # --- wiring ---
 
 
 
664
  def do_search(query):
665
  note, options = search_tickers_cb(query)
666
+ # Clear the previous selection to avoid a "not in choices" error
667
+ return note, gr.update(choices=options, value=None)
668
 
669
  search_btn.click(fn=do_search, inputs=q, outputs=[search_note, matches])
670
+ add_btn.click(fn=add_symbol, inputs=[matches, table], outputs=[table, search_note, matches])
671
  table.change(fn=lock_ticker_column, inputs=table, outputs=table)
672
  horizon.change(fn=set_horizon, inputs=horizon, outputs=universe_msg)
673
 
674
  run_btn.click(
675
  fn=compute,
676
+ inputs=[lookback, table, risk_selector, pick_selector],
677
+ outputs=[plot, summary, universe_msg, positions, state, risk_selector, pick_selector, sugg_metrics, suggestions]
678
  )
679
 
680
+ # Update suggestion view without recomputing moments
681
+ risk_selector.change(
682
+ fn=update_suggestion,
683
+ inputs=[risk_selector, pick_selector, state],
684
+ outputs=[plot, sugg_metrics, suggestions]
685
+ )
686
+ pick_selector.change(
687
+ fn=update_suggestion,
688
+ inputs=[risk_selector, pick_selector, state],
689
+ outputs=[plot, sugg_metrics, suggestions]
690
  )
691
 
692
  if __name__ == "__main__":