Tulitula committed on
Commit
9c2fb56
·
verified ·
1 Parent(s): 7785336

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +337 -434
app.py CHANGED
@@ -1,7 +1,5 @@
1
- import os
2
- import io
3
- import math
4
- import warnings
5
  warnings.filterwarnings("ignore")
6
 
7
  from typing import List, Tuple, Dict, Optional
@@ -9,75 +7,62 @@ from typing import List, Tuple, Dict, Optional
9
  import numpy as np
10
  import pandas as pd
11
  import matplotlib.pyplot as plt
12
- import gradio as gr
13
  from PIL import Image
 
 
14
  import requests
15
  import yfinance as yf
16
 
17
- # Embeddings
18
- from sentence_transformers import SentenceTransformer, util
 
 
 
 
 
 
 
 
 
19
 
20
  # ---------------- config ----------------
21
  DATA_DIR = "data"
22
- DATASET_PATH = os.path.join(DATA_DIR, "investor_profiles.csv")
23
 
24
- MAX_TICKERS = 30
25
  DEFAULT_LOOKBACK_YEARS = 5
26
- MARKET_TICKER = "VOO" # will auto-fallback to SPY if VOO missing
 
27
 
 
28
  POS_COLS = ["ticker", "amount_usd", "weight_exposure", "beta"]
29
- SUG_COLS = ["ticker", "suggested_weight_pct"]
30
 
 
31
  FRED_MAP = [
32
- (1, "DGS1"),
33
- (2, "DGS2"),
34
- (3, "DGS3"),
35
- (5, "DGS5"),
36
- (7, "DGS7"),
37
- (10, "DGS10"),
38
- (20, "DGS20"),
39
- (30, "DGS30"),
40
- (100, "DGS30"),
41
  ]
42
 
43
- # Embedding model cfg
44
- EMB_MODEL_NAME = "FinLang/finance-embeddings-investopedia"
45
-
46
- # ---------------- globals (runtime) ----------------
47
  HORIZON_YEARS = 5.0
48
  RF_CODE = "DGS5"
49
- RF_ANN = 0.03
50
-
51
- UNIVERSE: List[str] = [MARKET_TICKER, "QQQ", "XLK", "XLP", "XLE", "VNQ", "IEF", "HYG", "GLD", "EEM"]
52
 
53
- LAST_DATASET_PATH: Optional[str] = None
54
- LAST_UNIVERSE: Optional[List[str]] = None
55
- LAST_PLOT_STATE: Optional[Dict[str, float]] = None
56
-
57
- # embedding caches
58
- _EMB_MODEL = None
59
- _DS_TEXTS = None
60
- _DS_EMBS = None
61
- _DS_CACHE_KEY = None # (csv_path, tuple(universe))
62
-
63
-
64
- # ---------------- helpers ----------------
65
  def ensure_data_dir():
66
  os.makedirs(DATA_DIR, exist_ok=True)
67
 
 
 
 
 
68
 
69
- def empty_positions_df():
70
- return pd.DataFrame(columns=POS_COLS)
71
-
72
-
73
- def empty_suggest_df():
74
- return pd.DataFrame(columns=SUG_COLS)
75
-
76
-
77
  def fmt_pct(x: float) -> str:
78
  return f"{x*100:.2f}%"
79
 
80
-
81
  def fred_series_for_horizon(years: float) -> str:
82
  y = max(1.0, min(100.0, float(years)))
83
  for cutoff, code in FRED_MAP:
@@ -85,7 +70,6 @@ def fred_series_for_horizon(years: float) -> str:
85
  return code
86
  return "DGS30"
87
 
88
-
89
  def fetch_fred_yield_annual(code: str) -> float:
90
  url = f"https://fred.stlouisfed.org/graph/fredgraph.csv?id={code}"
91
  try:
@@ -97,8 +81,7 @@ def fetch_fred_yield_annual(code: str) -> float:
97
  except Exception:
98
  return 0.03
99
 
100
-
101
- # -------- Yahoo symbol search ----------
102
  def yahoo_search(query: str):
103
  if not query or len(query.strip()) == 0:
104
  return []
@@ -117,169 +100,93 @@ def yahoo_search(query: str):
117
  if sym and sym.isascii():
118
  out.append({"symbol": sym, "name": name, "exchange": exch})
119
  if not out:
120
- out = [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": "n a"}]
121
  return out[:10]
122
  except Exception:
123
- return [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": "n a"}]
124
-
125
-
126
- # --------- prices / returns ----------
127
- def _extract_close(df: pd.DataFrame, tickers: List[str]) -> pd.DataFrame:
128
- """
129
- Robustly extract a (date x ticker) Close DataFrame regardless of yf's column layout.
130
- """
131
- if isinstance(df.columns, pd.MultiIndex):
132
- lv0 = df.columns.get_level_values(0)
133
- lv1 = df.columns.get_level_values(1)
134
- if "Close" in lv0:
135
- close = df["Close"]
136
- elif "Adj Close" in lv0:
137
- close = df["Adj Close"]
138
- elif "Close" in lv1:
139
- close = df.xs("Close", level=1, axis=1)
140
- elif "Adj Close" in lv1:
141
- close = df.xs("Adj Close", level=1, axis=1)
142
- else:
143
- # fallback: if first level are tickers
144
- # try to select 'Close' under each
145
- try:
146
- close = df.xs("Close", level=1, axis=1)
147
- except Exception:
148
- close = df.copy()
149
- else:
150
- # Single ticker case
151
- if "Close" in df.columns:
152
- s = df["Close"].copy()
153
- elif "Adj Close" in df.columns:
154
- s = df["Adj Close"].copy()
155
- else:
156
- # last resort: take any one numeric column
157
- s = df.select_dtypes(include=[np.number]).iloc[:, 0]
158
- # ensure column named as ticker
159
- name = tickers[0] if len(tickers) else "T0"
160
- close = s.to_frame(name=name)
161
-
162
- # Reindex columns to requested order where possible
163
- # If some symbols missing, they simply won't be present
164
- close = close.dropna(how="all").ffill()
165
- # Keep only requested tickers, in order
166
- cols = [c for c in tickers if c in close.columns]
167
- if not cols: # if nothing matched, keep whatever is there
168
- close = close.copy()
169
- else:
170
- close = close[cols]
171
- return close
172
-
173
 
174
  def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
175
  start = pd.Timestamp.today(tz="UTC") - pd.DateOffset(years=years, days=7)
176
  end = pd.Timestamp.today(tz="UTC")
177
- dl = yf.download(
178
  list(dict.fromkeys(tickers)),
179
- start=start.date(),
180
- end=end.date(),
181
- interval="1mo",
182
- auto_adjust=True,
183
- progress=False
184
- )
185
- close = _extract_close(dl, tickers)
186
- return close
187
-
188
 
189
  def monthly_returns(prices: pd.DataFrame) -> pd.DataFrame:
190
- return prices.pct_change().dropna(how="all")
191
-
192
-
193
- def annualize_mean(m):
194
- return np.asarray(m, dtype=float) * 12.0
195
-
196
-
197
- def annualize_sigma(s):
198
- return np.asarray(s, dtype=float) * math.sqrt(12.0)
199
-
200
 
201
  def validate_tickers(symbols: List[str], years: int) -> List[str]:
202
- uniq = list(dict.fromkeys(symbols))
203
- df = fetch_prices_monthly(uniq, years)
204
- ok = [s for s in uniq if s in df.columns]
 
205
  return ok
206
 
207
-
208
- # -------------- aligned moments --------------
209
  def get_aligned_monthly_returns(symbols: List[str], years: int) -> pd.DataFrame:
210
- uniq = [c for c in dict.fromkeys(symbols) if c]
211
- tickers = uniq.copy()
212
-
213
- # Ensure market present (try MARKET_TICKER then fallback to SPY)
214
- market_ok = MARKET_TICKER in tickers
215
- if not market_ok:
216
- tickers.append(MARKET_TICKER)
217
-
218
  px = fetch_prices_monthly(tickers, years)
219
- if MARKET_TICKER not in px.columns:
220
- # fallback to SPY if VOO missing
221
- if "SPY" not in tickers:
222
- tickers.append("SPY")
223
- px2 = fetch_prices_monthly(tickers, years)
224
- if "SPY" in px2.columns:
225
- px = px2
226
- else:
227
- pass # keep px as-is
228
-
229
  rets = monthly_returns(px)
230
- keep = [c for c in uniq if c in rets.columns]
231
- if MARKET_TICKER in rets.columns:
232
- keep += [MARKET_TICKER]
233
- elif "SPY" in rets.columns:
234
- keep += ["SPY"]
235
- R = rets[keep].dropna(how="any")
236
  return R.loc[:, ~R.columns.duplicated()]
237
 
238
-
239
  def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
240
- R = get_aligned_monthly_returns(symbols, years)
241
- mkt_col = MARKET_TICKER if MARKET_TICKER in R.columns else ("SPY" if "SPY" in R.columns else None)
242
- if mkt_col is None or R.shape[0] < 3:
243
- raise ValueError("Not enough aligned data including market")
244
 
245
  rf_m = rf_ann / 12.0
246
- m = R[mkt_col]
 
 
247
  if isinstance(m, pd.DataFrame):
248
  m = m.iloc[:, 0].squeeze()
249
 
250
- mu_m_ann = float(annualize_mean(m.mean()))
251
- sigma_m_ann = float(annualize_sigma(m.std(ddof=1)))
252
  erp_ann = float(mu_m_ann - rf_ann)
253
 
254
  ex_m = m - rf_m
255
  var_m = float(np.var(ex_m.values, ddof=1))
256
- var_m = max(var_m, 1e-8)
257
 
 
258
  betas: Dict[str, float] = {}
259
- for s in [c for c in R.columns if c != mkt_col]:
 
 
 
260
  ex_s = R[s] - rf_m
261
- betas[s] = float(np.cov(ex_s.values, ex_m.values, ddof=1)[0, 1] / var_m)
262
- betas[mkt_col] = 1.0 # definition
263
-
264
- asset_cols = [c for c in R.columns if c != mkt_col]
265
- cov_m = np.cov(R[asset_cols].values.T, ddof=1) if asset_cols else np.zeros((0, 0))
266
- covA = pd.DataFrame(cov_m * 12.0, index=asset_cols, columns=asset_cols)
267
-
268
- return {"betas": betas, "cov_ann": covA, "erp_ann": erp_ann, "sigma_m_ann": sigma_m_ann, "mkt_col": mkt_col}
 
 
269
 
 
270
 
271
  def capm_er(beta: float, rf_ann: float, erp_ann: float) -> float:
272
  return float(rf_ann + beta * erp_ann)
273
 
274
-
275
  def portfolio_stats(weights: Dict[str, float],
276
  cov_ann: pd.DataFrame,
277
  betas: Dict[str, float],
278
  rf_ann: float,
279
  erp_ann: float) -> Tuple[float, float, float]:
280
  tickers = list(weights.keys())
281
- if len(tickers) == 0:
282
- return 0.0, 0.0, 0.0
283
  w = np.array([weights[t] for t in tickers], dtype=float)
284
  gross = float(np.sum(np.abs(w)))
285
  if gross == 0:
@@ -291,58 +198,67 @@ def portfolio_stats(weights: Dict[str, float],
291
  sigma_p = math.sqrt(float(max(w_expo.T @ cov @ w_expo, 0.0)))
292
  return beta_p, er_p, sigma_p
293
 
294
-
295
- # -------------- CML helpers --------------
296
  def efficient_same_sigma(sigma_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
297
  if sigma_mkt <= 1e-12:
298
  return 0.0, 1.0, rf_ann
299
  a = sigma_target / sigma_mkt
300
- return a, 1.0 - a, rf_ann + a * erp_ann
301
-
302
 
303
  def efficient_same_return(mu_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
304
  if abs(erp_ann) <= 1e-12:
305
  return 0.0, 1.0, rf_ann
306
  a = (mu_target - rf_ann) / erp_ann
307
- return a, 1.0 - a, abs(a) * sigma_mkt
308
 
309
-
310
- def plot_cml(
311
- rf_ann, erp_ann, sigma_mkt,
312
- pt_sigma, pt_mu,
313
- same_sigma_sigma, same_sigma_mu,
314
- same_mu_sigma, same_mu_mu,
315
- targ_sigma=None, targ_mu=None
316
- ) -> Image.Image:
317
  fig = plt.figure(figsize=(6, 4), dpi=120)
318
 
319
  xmax = max(
320
  0.3,
321
  sigma_mkt * 2.0,
322
  pt_sigma * 1.4,
323
- (same_mu_sigma or 0.0) * 1.4,
324
- (same_sigma_sigma or 0.0) * 1.4,
325
- (targ_sigma or 0.0) * 1.4,
326
  )
327
  xs = np.linspace(0, xmax, 160)
328
  slope = erp_ann / max(sigma_mkt, 1e-12)
329
  cml = rf_ann + slope * xs
330
- plt.plot(xs, cml, label="CML via Market", linewidth=2.0)
331
 
332
- # key points
333
  plt.scatter([0.0], [rf_ann], label="Risk-free (FRED)")
334
- plt.scatter([sigma_mkt], [rf_ann + erp_ann], label="Market")
335
- plt.scatter([pt_sigma], [pt_mu], label="Your portfolio", marker="D")
336
-
337
- if same_sigma_sigma is not None and same_sigma_mu is not None:
338
- plt.scatter([same_sigma_sigma], [same_sigma_mu], label="Efficient same sigma", marker="o")
339
- if same_mu_sigma is not None and same_mu_mu is not None:
340
- plt.scatter([same_mu_sigma], [same_mu_mu], label="Efficient same return", marker="o")
341
- if targ_sigma is not None and targ_mu is not None:
342
- plt.scatter([targ_sigma], [targ_mu], label="Suggestion", marker="X", s=70)
 
 
 
 
 
 
 
 
 
 
 
 
343
 
344
  plt.xlabel("σ (annualized)")
345
  plt.ylabel("Expected return (annual)")
 
 
346
  plt.legend(loc="best")
347
  plt.tight_layout()
348
 
@@ -352,195 +268,181 @@ def plot_cml(
352
  buf.seek(0)
353
  return Image.open(buf)
354
 
355
-
356
- def _overlay_plot_with_suggestion(sigma_sugg: Optional[float], mu_sugg: Optional[float]) -> Optional[Image.Image]:
357
- if LAST_PLOT_STATE is None:
358
- return None
359
- s = LAST_PLOT_STATE
360
- return plot_cml(
361
- s["rf_ann"], s["erp_ann"], s["sigma_mkt"],
362
- s["pt_sigma"], s["pt_mu"],
363
- s["pt_sigma"], s["mu_eff_sigma"],
364
- s["sigma_eff_mu"], s["pt_mu"],
365
- targ_sigma=sigma_sugg, targ_mu=mu_sugg
366
- )
367
-
368
-
369
- # -------------- synthetic dataset (1,000 rows over *current* universe) --------------
370
- def build_synthetic_dataset(universe: List[str], years: int, rf_ann: float, erp_ann: float) -> pd.DataFrame:
371
- # Always include market column used in cov/beta (if present)
372
- symbols = list(sorted(set([s for s in universe if s] )))[:MAX_TICKERS]
373
- moms = estimate_all_moments_aligned(symbols, years, rf_ann)
374
- covA, betas = moms["cov_ann"], moms["betas"]
375
-
376
- rows, rng = [], np.random.default_rng(123)
377
- n = 1000
378
- for i in range(n):
379
  k = rng.integers(low=min(2, len(symbols)), high=min(8, len(symbols)) + 1)
380
  picks = list(rng.choice(symbols, size=k, replace=False))
381
- signs = rng.choice([-1.0, 1.0], size=k, p=[0.20, 0.80])
382
  raw = rng.dirichlet(np.ones(k))
383
  gross = 1.0 + float(rng.gamma(2.0, 0.5))
384
  w = gross * signs * raw
385
- # compute stats from CAPM + cov
386
- beta_p, er_p, sigma_p = portfolio_stats({picks[j]: w[j] for j in range(k)}, covA, betas, rf_ann, erp_ann)
387
- rows.append({
 
388
  "id": i,
 
389
  "tickers": ",".join(picks),
390
- "weights": ",".join(f"{x:.6f}" for x in w),
 
391
  "er_p": er_p,
392
- "sigma_p": sigma_p,
393
- "beta_p": beta_p
394
  })
 
395
 
396
- return pd.DataFrame(rows)
397
-
398
-
399
- def save_synth_csv(df: pd.DataFrame, path: str = DATASET_PATH):
400
  os.makedirs(os.path.dirname(path), exist_ok=True)
401
  df.to_csv(path, index=False)
402
 
 
 
 
 
 
 
 
 
 
 
 
 
403
 
404
- # ---------------- Embeddings helpers ----------------
405
- def _get_emb_model():
406
- global _EMB_MODEL
407
- if _EMB_MODEL is None:
408
- _EMB_MODEL = SentenceTransformer(EMB_MODEL_NAME)
409
- return _EMB_MODEL
410
-
411
-
412
- def _weights_top_phrase(universe, w, top=4):
413
- pairs = sorted([(universe[i], abs(float(w[i]))) for i in range(len(universe))],
414
- key=lambda t: -t[1])[:top]
415
- parts = [f"{t} {p*100:.1f}%" for t, p in pairs if p > 1e-4]
416
- return ", ".join(parts)
417
-
418
-
419
- def portfolio_to_sentence(universe, w, er, sigma, beta):
420
- return (f"portfolio with volatility {sigma*100:.2f} percent, "
421
- f"expected return {er*100:.2f} percent, beta {beta:.2f}, "
422
- f"weights mostly in {_weights_top_phrase(universe, w)}")
423
-
424
 
425
- def build_ds_embeddings(csv_path: str, universe: list):
426
- global _DS_TEXTS, _DS_EMBS, _DS_CACHE_KEY
427
- cache_key = (csv_path, tuple(universe))
428
- if _DS_EMBS is not None and _DS_CACHE_KEY == cache_key:
429
- return _DS_TEXTS, _DS_EMBS
 
 
 
 
430
 
431
- df = pd.read_csv(csv_path)
432
- texts = []
433
  rows = []
434
  for _, r in df.iterrows():
435
- ws = np.array([float(x) for x in str(r["weights"]).split(",")], dtype=float)
436
- ts = [t.strip().upper() for t in str(r["tickers"]).split(",")]
437
- wmap = {ts[i]: ws[i] for i in range(min(len(ts), len(ws)))}
438
- w = np.array([wmap.get(t, 0.0) for t in universe], dtype=float)
439
- g = np.sum(np.abs(w))
440
- if g <= 1e-12:
441
  continue
442
- w = w / g
443
- er = float(r["er_p"]); sigma = float(r["sigma_p"]); beta = float(r["beta_p"])
444
- txt = portfolio_to_sentence(universe, w, er, sigma, beta)
445
- texts.append(txt); rows.append((w, er, sigma, beta))
446
-
447
- model = _get_emb_model()
448
- embs = model.encode(texts, normalize_embeddings=True, show_progress_bar=False)
449
- _DS_TEXTS, _DS_EMBS, _DS_CACHE_KEY = (rows, embs, cache_key)
450
- return _DS_TEXTS, _DS_EMBS
451
-
452
-
453
- def pick_low_med_high(csv_path: str, universe: List[str]):
454
- df = pd.read_csv(csv_path)
455
- rows = []
456
- for _, r in df.iterrows():
457
  ws = [float(x) for x in str(r["weights"]).split(",")]
458
- ts = [t.strip().upper() for t in str(r["tickers"]).split(",")]
459
- wmap = {ts[i]: ws[i] for i in range(min(len(ts), len(ws)))}
460
- x = np.array([wmap.get(t, 0.0) for t in universe], dtype=float)
461
- g = float(np.sum(np.abs(x)))
462
- if g <= 1e-12:
463
- continue
464
- x = x / g
465
- rows.append((x, float(r["er_p"]), float(r["sigma_p"]), float(r["beta_p"])))
466
  if not rows:
467
- return None
468
- rows_sorted = sorted(rows, key=lambda t: t[2]) # by sigma
469
- return rows_sorted
470
-
471
-
472
- def _band_indices(n, level):
473
- if level == "low":
474
- return range(0, max(1, int(0.25 * n)))
475
- if level == "medium":
476
- a, b = int(0.375 * n), int(0.625 * n)
477
- return range(max(0, a), min(n, b))
478
- return range(max(0, int(0.75 * n)), n) # high
479
-
480
-
481
- def suggest_level(level: str):
482
- if not LAST_DATASET_PATH or not os.path.exists(LAST_DATASET_PATH) or not LAST_UNIVERSE:
483
- return empty_suggest_df(), "Run Compute first.", None
484
- rows_texts, embs = build_ds_embeddings(LAST_DATASET_PATH, LAST_UNIVERSE)
485
- if not rows_texts:
486
- return empty_suggest_df(), "No dataset rows.", None
487
-
488
- n = len(rows_texts)
489
- band = list(_band_indices(n, level))
490
- if not band:
491
- return empty_suggest_df(), "No rows in band.", None
492
-
493
- prompts = {
494
- "low": "conservative low-risk portfolio with low volatility and low beta",
495
- "medium": "balanced moderate-risk portfolio with moderate volatility and beta around 1",
496
- "high": "aggressive high-risk growth portfolio with high volatility and beta above 1",
497
- }
498
- q = prompts.get(level, "balanced portfolio")
499
- model = _get_emb_model()
500
- q_emb = model.encode([q], normalize_embeddings=True)
501
- band_embs = embs[band]
502
- sims = util.cos_sim(q_emb, band_embs).cpu().numpy()[0]
503
- best_idx_in_band = int(np.argmax(sims))
504
- x, er_p, sig_p, beta_p = rows_texts[band[best_idx_in_band]]
505
-
506
- rows_df = [{"ticker": LAST_UNIVERSE[i], "suggested_weight_pct": float(x[i]) * 100.0}
507
- for i in range(len(LAST_UNIVERSE))]
508
- df = pd.DataFrame(rows_df, columns=SUG_COLS).sort_values("suggested_weight_pct", ascending=False)
509
- msg = f"{level.capitalize()} risk (embedding-ranked) → ER {fmt_pct(er_p)}, Sigma {fmt_pct(sig_p)}, Beta {beta_p:.2f}"
510
- img = _overlay_plot_with_suggestion(sig_p, er_p)
511
- return df, msg, img
512
-
513
-
514
- # -------------- summary builder --------------
 
 
 
 
 
515
  def build_summary_md(lookback, horizon, rf, rf_code, erp, sigma_mkt,
516
  beta_p, er_p, sigma_p,
517
  a_sigma, b_sigma, mu_eff_sigma,
518
- a_mu, b_mu, sigma_eff_mu) -> str:
 
 
519
  lines = []
520
  lines.append("### Inputs")
521
- lines.append(f"- Lookback years **{lookback}**")
522
- lines.append(f"- Horizon years **{int(round(horizon))}**")
523
- lines.append(f"- Risk free **{fmt_pct(rf)}** from **{rf_code}**")
524
- lines.append(f"- Market ERP **{fmt_pct(erp)}**")
525
- lines.append(f"- Market σ **{fmt_pct(sigma_mkt)}**")
526
  lines.append("")
527
  lines.append("### Your portfolio (CAPM expectations)")
528
- lines.append(f"- Beta **{beta_p:.2f}**")
529
- lines.append(f"- σ **{fmt_pct(sigma_p)}**")
530
- lines.append(f"- Expected return **{fmt_pct(er_p)}**")
531
  lines.append("")
532
  lines.append("### Efficient alternatives on CML")
533
- lines.append("**Same σ as your portfolio**")
534
- lines.append(f"- Market weight **{a_sigma:.2f}**, Bills weight **{b_sigma:.2f}**")
535
- lines.append(f"- Expected return **{fmt_pct(mu_eff_sigma)}**")
536
  lines.append("")
537
- lines.append("**Same expected return as your portfolio**")
538
- lines.append(f"- Market weight **{a_mu:.2f}**, Bills weight **{b_mu:.2f}**")
539
- lines.append(f"- σ **{fmt_pct(sigma_eff_mu)}**")
 
 
540
  return "\n".join(lines)
541
 
542
-
543
- # -------------- gradio callbacks --------------
544
  def search_tickers_cb(q: str):
545
  hits = yahoo_search(q)
546
  if not hits:
@@ -548,7 +450,6 @@ def search_tickers_cb(q: str):
548
  opts = [f"{h['symbol']} | {h['name']} | {h['exchange']}" for h in hits]
549
  return "Select a symbol and click Add", opts
550
 
551
-
552
  def add_symbol(selection: str, table: pd.DataFrame):
553
  if not selection:
554
  return table, "Pick a row from Matches first"
@@ -570,7 +471,6 @@ def add_symbol(selection: str, table: pd.DataFrame):
570
  msg = f"Reached max of {MAX_TICKERS}"
571
  return new_table, msg
572
 
573
-
574
  def lock_ticker_column(tb: pd.DataFrame):
575
  if tb is None or len(tb) == 0:
576
  return pd.DataFrame(columns=["ticker", "amount_usd"])
@@ -581,7 +481,6 @@ def lock_ticker_column(tb: pd.DataFrame):
581
  amounts = amounts[:len(tickers)] + [0.0] * max(0, len(tickers) - len(amounts))
582
  return pd.DataFrame({"ticker": tickers, "amount_usd": amounts})
583
 
584
-
585
  def set_horizon(years: float):
586
  y = max(1.0, min(100.0, float(years)))
587
  code = fred_series_for_horizon(y)
@@ -590,38 +489,40 @@ def set_horizon(years: float):
590
  HORIZON_YEARS = y
591
  RF_CODE = code
592
  RF_ANN = rf
593
- return f"Risk free series {code}. Latest annual rate {rf:.2%}. Will be used for CAPM and CML."
594
-
595
-
596
- def compute(years_lookback: int, table: pd.DataFrame):
597
- if table is None or len(table) == 0:
598
- return None, "Add at least one ticker", "Universe empty", empty_positions_df(), empty_suggest_df(), None
599
 
 
 
 
 
 
600
  df = table.dropna()
601
  df["ticker"] = df["ticker"].astype(str).str.upper().str.strip()
602
  df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce").fillna(0.0)
603
 
604
  symbols = [t for t in df["ticker"].tolist() if t]
605
  if len(symbols) == 0:
606
- return None, "Add at least one ticker", "Universe empty", empty_positions_df(), empty_suggest_df(), None
607
 
608
  symbols = validate_tickers(symbols, years_lookback)
609
  if len(symbols) == 0:
610
- return None, "Could not validate any tickers", "Universe invalid", empty_positions_df(), empty_suggest_df(), None
611
 
612
- global UNIVERSE
613
- UNIVERSE = list(sorted(set(symbols)))[:MAX_TICKERS]
614
 
615
- df = df[df["ticker"].isin(symbols)].copy()
616
- amounts = {r["ticker"]: float(r["amount_usd"]) for _, r in df.iterrows()}
 
617
  rf_ann = RF_ANN
618
 
619
- moms = estimate_all_moments_aligned(UNIVERSE, years_lookback, rf_ann)
 
620
  betas, covA, erp_ann, sigma_mkt = moms["betas"], moms["cov_ann"], moms["erp_ann"], moms["sigma_m_ann"]
621
 
622
  gross = sum(abs(v) for v in amounts.values())
623
  if gross == 0:
624
- return None, "All amounts are zero", "Universe ok", empty_positions_df(), empty_suggest_df(), None
625
  weights = {k: v / gross for k, v in amounts.items()}
626
 
627
  beta_p, er_p, sigma_p = portfolio_stats(weights, covA, betas, rf_ann, erp_ann)
@@ -629,53 +530,72 @@ def compute(years_lookback: int, table: pd.DataFrame):
629
  a_sigma, b_sigma, mu_eff_sigma = efficient_same_sigma(sigma_p, rf_ann, erp_ann, sigma_mkt)
630
  a_mu, b_mu, sigma_eff_mu = efficient_same_return(er_p, rf_ann, erp_ann, sigma_mkt)
631
 
632
- img = plot_cml(
 
 
 
 
 
 
 
 
 
 
 
 
 
633
  rf_ann, erp_ann, sigma_mkt,
634
  sigma_p, er_p,
635
  sigma_p, mu_eff_sigma,
636
  sigma_eff_mu, er_p,
637
- targ_sigma=None, targ_mu=None
638
  )
639
 
 
640
  info = build_summary_md(
641
  years_lookback, HORIZON_YEARS, rf_ann, RF_CODE, erp_ann, sigma_mkt,
642
  beta_p, er_p, sigma_p,
643
  a_sigma, b_sigma, mu_eff_sigma,
644
- a_mu, b_mu, sigma_eff_mu
 
 
645
  )
646
 
 
647
  rows = []
648
- for t in UNIVERSE:
649
- beta_val = 1.0 if abs(betas.get(t, 0.0) - 1.0) < 1e-6 else betas.get(t, np.nan)
650
  rows.append({
651
  "ticker": t,
652
  "amount_usd": amounts.get(t, 0.0),
653
  "weight_exposure": weights.get(t, 0.0),
654
- "beta": beta_val,
655
  })
656
  pos_table = pd.DataFrame(rows, columns=POS_COLS)
657
 
658
- # build 1,000-row dataset over CURRENT universe
659
- synth_df = build_synthetic_dataset(UNIVERSE, years=DEFAULT_LOOKBACK_YEARS, rf_ann=rf_ann, erp_ann=erp_ann)
660
- save_synth_csv(synth_df, DATASET_PATH)
661
-
662
- # update globals for suggestion buttons
663
- global LAST_DATASET_PATH, LAST_UNIVERSE, LAST_PLOT_STATE
664
- LAST_DATASET_PATH = DATASET_PATH
665
- LAST_UNIVERSE = UNIVERSE.copy()
666
- LAST_PLOT_STATE = {
667
- "rf_ann": rf_ann, "erp_ann": erp_ann, "sigma_mkt": sigma_mkt,
668
- "pt_sigma": sigma_p, "pt_mu": er_p,
669
- "mu_eff_sigma": mu_eff_sigma, "sigma_eff_mu": sigma_eff_mu
670
- }
671
-
672
- uni_msg = f"Universe set to: {', '.join(UNIVERSE)} — dataset generated with 1,000 mixes."
673
- return img, info, uni_msg, pos_table, empty_suggest_df(), DATASET_PATH
674
 
 
 
675
 
676
- # -------------- UI --------------
677
  ensure_data_dir()
678
- # initial RF based on default horizon
 
679
  HORIZON_YEARS = 5.0
680
  RF_CODE = fred_series_for_horizon(HORIZON_YEARS)
681
  RF_ANN = fetch_fred_yield_annual(RF_CODE)
@@ -684,8 +604,9 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
684
  gr.Markdown(
685
  "## Efficient Portfolio Advisor\n"
686
  "Search symbols, enter dollar amounts, set your horizon. "
687
- "Prices from Yahoo Finance. Risk-free from FRED. "
688
- "Low/Medium/High suggestions use embeddings over a 1,000-mix dataset generated from your current universe."
 
689
  )
690
 
691
  with gr.Row():
@@ -693,8 +614,9 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
693
  q = gr.Textbox(label="Search symbol")
694
  search_note = gr.Markdown()
695
  matches = gr.Dropdown(choices=[], label="Matches")
696
- search_btn = gr.Button("Search")
697
- add_btn = gr.Button("Add selected to portfolio")
 
698
 
699
  gr.Markdown("### Portfolio positions — type dollar amounts (negatives allowed for shorts)")
700
  table = gr.Dataframe(
@@ -704,41 +626,38 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
704
  col_count=(2, "fixed")
705
  )
706
 
707
- horizon = gr.Number(label="Horizon in years (1–100)", value=5, precision=0)
708
- lookback = gr.Slider(1, 10, value=DEFAULT_LOOKBACK_YEARS, step=1, label="Lookback years for beta and sigma")
709
 
710
- run_btn = gr.Button("Compute (build dataset & plot)")
 
 
711
 
712
- gr.Markdown("### Suggestions (dataset + embeddings)")
713
- with gr.Row():
714
- btn_low = gr.Button("Suggest LOW risk")
715
- btn_med = gr.Button("Suggest MEDIUM risk")
716
- btn_high = gr.Button("Suggest HIGH risk")
717
 
718
  with gr.Column(scale=1):
719
  plot = gr.Image(label="Capital Market Line (CML)", type="pil")
720
  summary = gr.Markdown(label="Summary")
721
- universe_msg = gr.Textbox(label="Status", interactive=False)
722
  positions = gr.Dataframe(
723
  label="Computed positions",
724
  headers=POS_COLS,
725
  datatype=["str", "number", "number", "number"],
726
  col_count=(len(POS_COLS), "fixed"),
727
- value=empty_positions_df(),
728
  interactive=False
729
  )
730
  suggestions = gr.Dataframe(
731
- label="Suggested portfolio (weights as % exposures)",
732
  headers=SUG_COLS,
733
- datatype=["str", "number"],
734
  col_count=(len(SUG_COLS), "fixed"),
735
- value=empty_suggest_df(),
736
  interactive=False
737
  )
738
- sugg_msg = gr.Textbox(label="Suggestion detail", interactive=False)
739
- dl = gr.File(label="Generated dataset (CSV)", value=None, visible=True)
740
 
741
- # wiring
742
  def do_search(query):
743
  note, options = search_tickers_cb(query)
744
  return note, gr.update(choices=options)
@@ -749,26 +668,10 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
749
  horizon.change(fn=set_horizon, inputs=horizon, outputs=universe_msg)
750
 
751
  run_btn.click(
752
- fn=compute,
753
- inputs=[lookback, table],
754
  outputs=[plot, summary, universe_msg, positions, suggestions, dl]
755
  )
756
 
757
- def do_low():
758
- df, msg, img = suggest_level("low")
759
- return df, msg, (img if img is not None else gr.update())
760
-
761
- def do_med():
762
- df, msg, img = suggest_level("medium")
763
- return df, msg, (img if img is not None else gr.update())
764
-
765
- def do_high():
766
- df, msg, img = suggest_level("high")
767
- return df, msg, (img if img is not None else gr.update())
768
-
769
- btn_low.click(fn=do_low, inputs=None, outputs=[suggestions, sugg_msg, plot])
770
- btn_med.click(fn=do_med, inputs=None, outputs=[suggestions, sugg_msg, plot])
771
- btn_high.click(fn=do_high, inputs=None, outputs=[suggestions, sugg_msg, plot])
772
-
773
  if __name__ == "__main__":
774
  demo.launch()
 
1
+ # app.py
2
+ import os, io, math, json, hashlib, warnings
 
 
3
  warnings.filterwarnings("ignore")
4
 
5
  from typing import List, Tuple, Dict, Optional
 
7
  import numpy as np
8
  import pandas as pd
9
  import matplotlib.pyplot as plt
10
+ from matplotlib.ticker import PercentFormatter
11
  from PIL import Image
12
+
13
+ import gradio as gr
14
  import requests
15
  import yfinance as yf
16
 
17
+ # Optional embeddings (lazy-loaded)
18
+ _EMBED_MODEL = None
19
+ def get_embed_model():
20
+ global _EMBED_MODEL
21
+ if _EMBED_MODEL is None:
22
+ try:
23
+ from sentence_transformers import SentenceTransformer
24
+ _EMBED_MODEL = SentenceTransformer("FinLang/finance-embeddings-investopedia")
25
+ except Exception as e:
26
+ _EMBED_MODEL = False
27
+ return _EMBED_MODEL
28
 
29
  # ---------------- config ----------------
30
  DATA_DIR = "data"
31
+ os.makedirs(DATA_DIR, exist_ok=True)
32
 
33
+ MARKET_TICKER = "VOO" # “market” proxy
34
  DEFAULT_LOOKBACK_YEARS = 5
35
+ MAX_TICKERS = 30
36
+ SYNTH_ROWS = 1000
37
 
38
+ # UI tables
39
  POS_COLS = ["ticker", "amount_usd", "weight_exposure", "beta"]
40
+ SUG_COLS = ["pick", "ticker", "weight_exposure", "er_%", "sigma_%", "beta"]
41
 
42
+ # FRED tenor map
43
  FRED_MAP = [
44
+ (1, "DGS1"), (2, "DGS2"), (3, "DGS3"),
45
+ (5, "DGS5"), (7, "DGS7"), (10, "DGS10"),
46
+ (20, "DGS20"), (30, "DGS30"), (100, "DGS30"),
 
 
 
 
 
 
47
  ]
48
 
49
+ # Session globals
 
 
 
50
  HORIZON_YEARS = 5.0
51
  RF_CODE = "DGS5"
52
+ RF_ANN = 0.02
 
 
53
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def ensure_data_dir():
55
  os.makedirs(DATA_DIR, exist_ok=True)
56
 
57
def dataset_path_for_universe(universe: List[str]) -> str:
    """Return the CSV path that stores the dataset for ``universe``.

    The file name embeds the first 10 hex digits of the SHA-256 of the
    sorted, comma-joined tickers, so the same tickers map to the same file
    regardless of the order they are given in.
    """
    canonical = ",".join(sorted(universe))
    digest = hashlib.sha256(canonical.encode()).hexdigest()
    filename = f"investor_profiles_{digest[:10]}.csv"
    return os.path.join(DATA_DIR, filename)
61
 
62
+ # ---------------- tiny utils ----------------
 
 
 
 
 
 
 
63
  def fmt_pct(x: float) -> str:
64
  return f"{x*100:.2f}%"
65
 
 
66
  def fred_series_for_horizon(years: float) -> str:
67
  y = max(1.0, min(100.0, float(years)))
68
  for cutoff, code in FRED_MAP:
 
70
  return code
71
  return "DGS30"
72
 
 
73
  def fetch_fred_yield_annual(code: str) -> float:
74
  url = f"https://fred.stlouisfed.org/graph/fredgraph.csv?id={code}"
75
  try:
 
81
  except Exception:
82
  return 0.03
83
 
84
+ # ---------------- Yahoo search ----------------
 
85
  def yahoo_search(query: str):
86
  if not query or len(query.strip()) == 0:
87
  return []
 
100
  if sym and sym.isascii():
101
  out.append({"symbol": sym, "name": name, "exchange": exch})
102
  if not out:
103
+ out = [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": "n/a"}]
104
  return out[:10]
105
  except Exception:
106
+ return [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": "n/a"}]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
    """Download monthly, dividend/split-adjusted close prices from Yahoo.

    Args:
        tickers: Symbols to download; duplicates are dropped, keeping the
            first occurrence.
        years: Lookback length in years (a further 7 days are added).

    Returns:
        DataFrame indexed by date with one column per returned ticker. Rows
        that are empty for every ticker are removed and remaining gaps are
        forward-filled.
    """
    symbols = list(dict.fromkeys(tickers))
    end = pd.Timestamp.today(tz="UTC")
    start = end - pd.DateOffset(years=years, days=7)
    close = yf.download(
        symbols,
        start=start.date(), end=end.date(),
        interval="1mo", auto_adjust=True, progress=False,
    )["Close"]
    if isinstance(close, pd.Series):
        # A Series selected via ["Close"] is named "Close"; label the column
        # with the ticker so callers' `symbol in df.columns` checks succeed.
        close = close.to_frame(name=symbols[0] if symbols else close.name)
    # .ffill() replaces fillna(method="ffill"), which pandas has deprecated.
    return close.dropna(how="all").ffill()
 
 
120
 
121
def monthly_returns(prices: pd.DataFrame) -> pd.DataFrame:
    """Convert a price table into simple period-over-period returns.

    Args:
        prices: Price levels, one column per ticker, one row per period.

    Returns:
        Fractional change between consecutive rows. Any row in which at
        least one column has no return (including the first row) is dropped.
    """
    # fill_method=None: the implicit forward-fill default of pct_change is
    # deprecated in pandas. Missing prices stay NaN and are removed by dropna.
    return prices.pct_change(fill_method=None).dropna()
 
 
 
 
 
 
 
 
 
123
 
124
def validate_tickers(symbols: List[str], years: int) -> List[str]:
    """Return the requested symbols for which price data was downloaded.

    Args:
        symbols: Candidate ticker symbols, possibly with repeats.
        years: Lookback in years forwarded to the price download.

    Returns:
        The symbols that appear as columns in the downloaded price table,
        in first-seen order with repeats removed.
    """
    # dict.fromkeys de-duplicates while keeping the caller's order, so the
    # result is deterministic and a repeated symbol is reported only once.
    unique = list(dict.fromkeys(symbols))
    prices = fetch_prices_monthly(unique, years)
    return [s for s in unique if s in prices.columns]
130
 
131
+ # ---------------- moments (aligned) ----------------
 
132
def get_aligned_monthly_returns(symbols: List[str], years: int) -> pd.DataFrame:
    """Return monthly returns for ``symbols`` plus the market on common dates.

    The market ticker is always requested and, when available, placed as the
    last column. Only rows where every kept column has a return survive.
    """
    assets = []
    for sym in dict.fromkeys(symbols):
        if sym != MARKET_TICKER:
            assets.append(sym)

    prices = fetch_prices_monthly(assets + [MARKET_TICKER], years)
    returns = monthly_returns(prices)

    keep = [sym for sym in assets if sym in returns.columns]
    if MARKET_TICKER in returns.columns:
        keep.append(MARKET_TICKER)

    aligned = returns[keep].dropna(how="any")
    first_occurrence = ~aligned.columns.duplicated()
    return aligned.loc[:, first_occurrence]
140
 
 
141
def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
    """Estimate betas, annualized covariance and market moments.

    All statistics are computed from the same aligned monthly return table
    (see ``get_aligned_monthly_returns``), which always includes the market
    ticker.

    Args:
        symbols: Ticker symbols to include alongside the market ticker.
        years: Lookback window in years.
        rf_ann: Annual risk-free rate; divided by 12 for monthly excess returns.

    Returns:
        A dict with keys:
        ``"betas"`` (symbol -> beta versus the market, market itself is 1.0),
        ``"cov_ann"`` (covariance DataFrame of monthly returns times 12,
        including the market column),
        ``"erp_ann"`` (annualized mean market return minus ``rf_ann``) and
        ``"sigma_m_ann"`` (monthly market std times sqrt(12)).

    Raises:
        ValueError: If the market column is missing or fewer than 3 aligned
            rows are available.
    """
    R = get_aligned_monthly_returns(symbols + [MARKET_TICKER], years)
    if MARKET_TICKER not in R.columns or R.shape[0] < 3:
        raise ValueError("Not enough aligned market data")

    rf_m = rf_ann / 12.0

    # Market series (defensively collapse to a Series if a frame comes back).
    m = R[MARKET_TICKER]
    if isinstance(m, pd.DataFrame):
        m = m.iloc[:, 0].squeeze()

    mu_m_ann = float(m.mean() * 12.0)
    sigma_m_ann = float(m.std(ddof=1) * math.sqrt(12.0))
    erp_ann = float(mu_m_ann - rf_ann)

    ex_m = m - rf_m
    # Floor the variance so the beta division below cannot blow up.
    var_m = max(float(np.var(ex_m.values, ddof=1)), 1e-10)

    # Betas for each asset (market == 1 by definition).
    betas: Dict[str, float] = {}
    for s in R.columns:
        if s == MARKET_TICKER:
            betas[s] = 1.0
            continue
        ex_s = R[s] - rf_m
        cov_sm = float(np.cov(ex_s.values, ex_m.values, ddof=1)[0, 1])
        betas[s] = float(cov_sm / var_m)

    # The market is kept in the covariance matrix. ``np.cov`` squeezes its
    # result, so a market-only universe yields a 0-d array; force 2-D so the
    # DataFrame construction works for a single column too.
    asset_cols = list(R.columns)
    cov_m = np.atleast_2d(np.cov(R[asset_cols].values.T, ddof=1))
    covA = pd.DataFrame(cov_m * 12.0, index=asset_cols, columns=asset_cols)

    return {"betas": betas, "cov_ann": covA, "erp_ann": erp_ann, "sigma_m_ann": sigma_m_ann}
180
 
181
def capm_er(beta: float, rf_ann: float, erp_ann: float) -> float:
    """Return the CAPM expected return: risk-free rate plus beta times the premium."""
    risk_premium = beta * erp_ann
    return float(rf_ann + risk_premium)
183
 
 
184
  def portfolio_stats(weights: Dict[str, float],
185
  cov_ann: pd.DataFrame,
186
  betas: Dict[str, float],
187
  rf_ann: float,
188
  erp_ann: float) -> Tuple[float, float, float]:
189
  tickers = list(weights.keys())
 
 
190
  w = np.array([weights[t] for t in tickers], dtype=float)
191
  gross = float(np.sum(np.abs(w)))
192
  if gross == 0:
 
198
  sigma_p = math.sqrt(float(max(w_expo.T @ cov @ w_expo, 0.0)))
199
  return beta_p, er_p, sigma_p
200
 
201
+ # ---------------- CML helpers & plot ----------------
 
202
def efficient_same_sigma(sigma_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
    """Find the market/bills mix on the CML whose volatility equals ``sigma_target``.

    Returns a tuple ``(market_weight, bills_weight, expected_return)``. When
    the market volatility is (numerically) zero the result is all bills with
    the risk-free return.
    """
    if sigma_mkt <= 1e-12:
        market_w, mu = 0.0, rf_ann
    else:
        market_w = sigma_target / sigma_mkt
        mu = rf_ann + market_w * erp_ann
    return market_w, 1 - market_w, mu
 
207
 
208
def efficient_same_return(mu_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
    """Find the market/bills mix on the CML whose expected return equals ``mu_target``.

    Args:
        mu_target: Desired annual expected return.
        rf_ann: Annual risk-free rate.
        erp_ann: Annual market excess return over ``rf_ann``.
        sigma_mkt: Annual market volatility.

    Returns:
        A tuple ``(market_weight, bills_weight, sigma)`` where ``sigma`` is
        ``abs(market_weight) * sigma_mkt``. When the excess return is
        (numerically) zero no market weight can change the return, so the
        result is all bills with zero volatility.
    """
    if abs(erp_ann) <= 1e-12:
        # Third slot is a volatility, not a return: an all-bills mix has
        # sigma 0.0 (the previous value here was the risk-free rate).
        return 0.0, 1.0, 0.0
    a = (mu_target - rf_ann) / erp_ann
    return a, 1 - a, abs(a) * sigma_mkt
213
 
214
+ def plot_cml_percent(rf_ann, erp_ann, sigma_mkt,
215
+ pt_sigma, pt_mu,
216
+ same_sigma_sigma, same_sigma_mu,
217
+ same_mu_sigma, same_mu_mu,
218
+ suggestion: Optional[Tuple[float, float]] = None) -> Image.Image:
 
 
 
219
  fig = plt.figure(figsize=(6, 4), dpi=120)
220
 
221
  xmax = max(
222
  0.3,
223
  sigma_mkt * 2.0,
224
  pt_sigma * 1.4,
225
+ same_sigma_sigma * 1.4,
226
+ same_mu_sigma * 1.4,
227
+ (suggestion[0] if suggestion else 0.0) * 1.5,
228
  )
229
  xs = np.linspace(0, xmax, 160)
230
  slope = erp_ann / max(sigma_mkt, 1e-12)
231
  cml = rf_ann + slope * xs
232
+ plt.plot(xs, cml, label="CML via Market")
233
 
234
+ # Points
235
  plt.scatter([0.0], [rf_ann], label="Risk-free (FRED)")
236
+ plt.scatter([sigma_mkt], [rf_ann + erp_ann], label=f"Market {MARKET_TICKER}")
237
+ plt.scatter([pt_sigma], [pt_mu], label="Your portfolio")
238
+ plt.scatter([same_sigma_sigma], [same_sigma_mu], label="Efficient same sigma")
239
+ plt.scatter([same_mu_sigma], [same_mu_mu], label="Efficient same return")
240
+ if suggestion is not None:
241
+ plt.scatter([suggestion[0]], [suggestion[1]], marker="X", s=70, label="Suggestion")
242
+
243
+ # Guides (percent annotated)
244
+ plt.plot([pt_sigma, same_sigma_sigma], [pt_mu, same_sigma_mu], ls="--", lw=1.0, alpha=0.7, c="gray")
245
+ d_ret = (same_sigma_mu - pt_mu) * 100.0
246
+ plt.annotate(f"Return gain at same σ {d_ret:+.2f}%",
247
+ xy=(same_sigma_sigma, same_sigma_mu),
248
+ xytext=(same_sigma_sigma, same_sigma_mu + 0.03),
249
+ arrowprops=dict(arrowstyle="->", lw=1.0), fontsize=9, ha="center")
250
+
251
+ plt.plot([pt_sigma, same_mu_sigma], [pt_mu, same_mu_mu], ls="--", lw=1.0, alpha=0.7, c="gray")
252
+ d_sig = (same_mu_sigma - pt_sigma) * 100.0
253
+ plt.annotate(f"Risk change at same μ {d_sig:+.2f}%",
254
+ xy=(same_mu_sigma, same_mu_mu),
255
+ xytext=(same_mu_sigma + 0.01, same_mu_mu),
256
+ arrowprops=dict(arrowstyle="->", lw=1.0), fontsize=9, va="center")
257
 
258
  plt.xlabel("σ (annualized)")
259
  plt.ylabel("Expected return (annual)")
260
+ plt.gca().xaxis.set_major_formatter(PercentFormatter(1.0))
261
+ plt.gca().yaxis.set_major_formatter(PercentFormatter(1.0))
262
  plt.legend(loc="best")
263
  plt.tight_layout()
264
 
 
268
  buf.seek(0)
269
  return Image.open(buf)
270
 
271
+ # ---------------- synthetic dataset ----------------
272
def synth_profile(seed: int) -> str:
    """Produce a short, seed-determined investor description.

    Three attributes (risk attitude, horizon, goal) are drawn in that order
    from a generator seeded with ``seed``.
    """
    rng = np.random.default_rng(seed)
    option_sets = (
        ["cautious", "balanced", "moderate", "growth", "aggressive"],
        ["3y", "5y", "7y", "10y", "15y"],
        ["retirement", "first home", "education", "wealth building", "travel", "emergency"],
    )
    # Generator unpacking draws sequentially, preserving the draw order.
    risk, horizon, goal = (rng.choice(options) for options in option_sets)
    return f"{risk} investor, {horizon} horizon, goal {goal}"
278
+
279
def build_synthetic_dataset(universe: List[str],
                            covA: pd.DataFrame,
                            betas: Dict[str, float],
                            rf_ann: float,
                            erp_ann: float,
                            rows: int = SYNTH_ROWS) -> pd.DataFrame:
    """Generate ``rows`` random long/short portfolios over the universe.

    The symbol pool is the sorted, de-duplicated universe plus the market
    ticker, truncated to ``MAX_TICKERS``. A fixed seed (123) makes the output
    reproducible for a given pool. Each record stores the picked tickers, the
    signed weights (5 decimals, comma-separated) and the beta / expected
    return / sigma reported by ``portfolio_stats``.
    """
    symbols = sorted(set(universe + [MARKET_TICKER]))[:MAX_TICKERS]
    n_syms = len(symbols)
    k_low, k_high = min(2, n_syms), min(8, n_syms) + 1

    rng = np.random.default_rng(123)
    records = []
    for row_id in range(rows):
        # NOTE: the order of the random draws below determines the dataset.
        k = rng.integers(low=k_low, high=k_high)
        picks = list(rng.choice(symbols, size=k, replace=False))
        signs = rng.choice([-1.0, 1.0], size=k, p=[0.25, 0.75])
        shares = rng.dirichlet(np.ones(k))
        leverage = 1.0 + float(rng.gamma(2.0, 0.5))
        w = leverage * signs * shares

        beta_p, er_p, sigma_p = portfolio_stats(
            dict(zip(picks, w)), covA, betas, rf_ann, erp_ann
        )
        records.append({
            "id": row_id,
            "profile_text": synth_profile(10_000 + row_id),
            "tickers": ",".join(picks),
            "weights": ",".join(format(x, ".5f") for x in w),
            "beta_p": beta_p,
            "er_p": er_p,
            "sigma_p": sigma_p,
        })
    return pd.DataFrame(records)
309
 
310
def save_synth_csv(df: pd.DataFrame, path: str):
    """Write ``df`` to ``path`` as CSV (without the index).

    The parent directory is created if needed. A bare filename (no directory
    component) is written to the current working directory.
    """
    parent = os.path.dirname(path)
    if parent:
        # ``os.makedirs("")`` raises, so only create a directory when the
        # path actually has one.
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False)
313
 
314
def _row_to_exposures(row: pd.Series, universe: List[str]) -> Optional[np.ndarray]:
    """Map a dataset row onto the universe as gross-normalized exposures.

    Reads the comma-separated ``"tickers"`` and ``"weights"`` fields of
    ``row``, places each weight at its ticker's position in ``universe``
    (0.0 for tickers the row does not hold) and divides by the sum of
    absolute values. Returns ``None`` when the row cannot be parsed or has
    no exposure to the universe.
    """
    try:
        names = [tok.strip() for tok in str(row["tickers"]).split(",")]
        values = [float(tok) for tok in str(row["weights"]).split(",")]
        # zip stops at the shorter list; a repeated ticker keeps its last weight.
        by_ticker = dict(zip(names, values))
        exposures = np.array([by_ticker.get(sym, 0.0) for sym in universe], dtype=float)
        total = float(np.abs(exposures).sum())
        if total <= 1e-12:
            return None
        return exposures / total
    except Exception:
        return None
326
 
327
def candidate_text(weights_map: Dict[str, float], er: float, sigma: float, beta: float) -> str:
    """Describe a candidate portfolio as one line of text.

    The text states the expected return, volatility and beta, followed by up
    to six holdings with the largest absolute weights (signed, 2 decimals).
    """
    by_size = sorted(weights_map.items(), key=lambda item: abs(item[1]), reverse=True)
    exposures = "; ".join(f"{name} {weight:+.2f}" for name, weight in by_size[:6])
    headline = f"portfolio with expected return {er:.4f}, volatility {sigma:.4f}, beta {beta:.2f}. "
    return headline + f"top exposures: {exposures}"
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
def dataset_suggestions(csv_path: str,
                        universe: List[str],
                        risk_level: str,
                        use_embeddings: bool,
                        top_k: int = 3):
    """Pick up to ``top_k`` portfolios from the CSV dataset for a risk level.

    Steps:
      1. Load the CSV (an unreadable file yields ``[]``) and keep rows that
         ``_row_to_exposures`` can map onto ``universe``.
      2. Select a pool by volatility: at or below the 10th percentile for
         ``"Low"``, at or above the 90th for ``"High"``, otherwise within
         0.03 of the median. An empty pool falls back to the candidates
         closest to the target volatility.
      3. Rank the pool by embedding similarity to a risk-specific query when
         ``use_embeddings`` is set and a model is available; otherwise (or on
         any embedding error) rank by distance to the target volatility.

    Returns:
        A list of dicts with keys ``"pick"`` (1-based rank), ``"weights"``
        (gross-normalized ticker -> weight), ``"er"``, ``"sigma"``, ``"beta"``.
    """
    try:
        df = pd.read_csv(csv_path)
    except Exception:
        return []

    def _sigma_gap(target):
        # Sort key: absolute distance between a candidate's sigma and target.
        return lambda cand: abs(cand[2] - target)

    # Collect (weights, er, sigma, beta) tuples usable for this universe.
    candidates = []
    for _, rec in df.iterrows():
        if _row_to_exposures(rec, universe) is None:
            continue
        names = [tok.strip() for tok in str(rec["tickers"]).split(",")]
        values = [float(tok) for tok in str(rec["weights"]).split(",")]
        raw_map = dict(zip(names, values))
        scale = sum(abs(v) for v in raw_map.values()) or 1.0
        display_map = {name: v / scale for name, v in raw_map.items()}
        candidates.append(
            (display_map, float(rec["er_p"]), float(rec["sigma_p"]), float(rec["beta_p"]))
        )

    if not candidates:
        return []

    # Risk buckets by sigma.
    sigma_values = np.array([cand[2] for cand in candidates])
    q10, q50, q90 = np.quantile(sigma_values, [0.10, 0.50, 0.90])

    if risk_level == "Low":
        target_sigma = q10
        query = "low risk conservative stable portfolio minimize volatility"
        pool = [cand for cand in candidates if cand[2] <= q10]
    elif risk_level == "High":
        target_sigma = q90
        query = "high risk aggressive growth portfolio accept high volatility maximize returns"
        pool = [cand for cand in candidates if cand[2] >= q90]
    else:
        target_sigma = q50
        query = "balanced moderate risk diversified portfolio"
        band = 0.03  # absolute sigma band around the median
        pool = [cand for cand in candidates if abs(cand[2] - q50) <= band]

    if not pool:
        # Nothing in the bucket: take the candidates closest to the target.
        pool = sorted(candidates, key=_sigma_gap(target_sigma))[: max(10, top_k)]

    # Rank inside the pool.
    ranked = None
    if use_embeddings and get_embed_model():
        try:
            model = get_embed_model()
            texts = [candidate_text(*cand) for cand in pool]
            embs = model.encode([query] + texts, normalize_embeddings=True)
            sims = (embs[1:] @ embs[0:1].T).ravel()
            ranked = [pool[idx] for idx in np.argsort(-sims)]
        except Exception:
            ranked = None
    if ranked is None:
        ranked = sorted(pool, key=_sigma_gap(target_sigma))

    out = []
    for rank, (wmap, er, sigma, beta) in enumerate(ranked[:top_k], start=1):
        # Normalize for display.
        scale = sum(abs(v) for v in wmap.values()) or 1.0
        out.append({
            "pick": rank,
            "weights": {k: v / scale for k, v in wmap.items()},
            "er": er,
            "sigma": sigma,
            "beta": beta,
        })
    return out
413
+
414
+ # ---------------- summary ----------------
415
def build_summary_md(lookback, horizon, rf, rf_code, erp, sigma_mkt,
                     beta_p, er_p, sigma_p,
                     a_sigma, b_sigma, mu_eff_sigma,
                     a_mu, b_mu, sigma_eff_mu,
                     risk_level: str,
                     suggestion: Optional[Dict] = None) -> str:
    """Render the Markdown summary shown next to the plot.

    Sections, in order: inputs, the user's portfolio, the two efficient
    alternatives on the CML, and the top dataset-based suggestion (or a
    placeholder line when ``suggestion`` is empty/None).
    """
    sections = [
        "### Inputs",
        f"- Lookback years: **{int(lookback)}**",
        f"- Horizon years: **{int(round(horizon))}**",
        f"- Risk-free: **{fmt_pct(rf)}** from **{rf_code}**",
        f"- Market ERP: **{fmt_pct(erp)}**",
        f"- Market σ: **{fmt_pct(sigma_mkt)}**",
        "",
        "### Your portfolio (CAPM expectations)",
        f"- Beta: **{beta_p:.2f}**",
        f"- σ: **{fmt_pct(sigma_p)}**",
        f"- Expected return: **{fmt_pct(er_p)}**",
        "",
        "### Efficient alternatives on CML",
        f"- Same σ: market **{a_sigma:.2f}**, bills **{b_sigma:.2f}**, μ **{fmt_pct(mu_eff_sigma)}**",
        f"- Same μ: market **{a_mu:.2f}**, bills **{b_mu:.2f}**, σ **{fmt_pct(sigma_eff_mu)}**",
        "",
        f"### Dataset-based suggestions (risk = **{risk_level}**)",
    ]
    if not suggestion:
        sections.append("- No suggestion available.")
    else:
        sections.append(
            f"- Top suggestion μ **{fmt_pct(suggestion['er'])}**, σ **{fmt_pct(suggestion['sigma'])}**, β **{suggestion['beta']:.2f}**"
        )
    return "\n".join(sections)
444
 
445
+ # ---------------- gradio callbacks ----------------
 
446
  def search_tickers_cb(q: str):
447
  hits = yahoo_search(q)
448
  if not hits:
 
450
  opts = [f"{h['symbol']} | {h['name']} | {h['exchange']}" for h in hits]
451
  return "Select a symbol and click Add", opts
452
 
 
453
  def add_symbol(selection: str, table: pd.DataFrame):
454
  if not selection:
455
  return table, "Pick a row from Matches first"
 
471
  msg = f"Reached max of {MAX_TICKERS}"
472
  return new_table, msg
473
 
 
474
  def lock_ticker_column(tb: pd.DataFrame):
475
  if tb is None or len(tb) == 0:
476
  return pd.DataFrame(columns=["ticker", "amount_usd"])
 
481
  amounts = amounts[:len(tickers)] + [0.0] * max(0, len(tickers) - len(amounts))
482
  return pd.DataFrame({"ticker": tickers, "amount_usd": amounts})
483
 
 
484
  def set_horizon(years: float):
485
  y = max(1.0, min(100.0, float(years)))
486
  code = fred_series_for_horizon(y)
 
489
  HORIZON_YEARS = y
490
  RF_CODE = code
491
  RF_ANN = rf
492
+ return f"Risk free series {code}. Latest annual rate {rf:.2%}. Will be used on compute."
 
 
 
 
 
493
 
494
+ def compute_and_suggest(years_lookback: int,
495
+ table: pd.DataFrame,
496
+ risk_level: str,
497
+ use_embeddings: bool):
498
+ # sanitize table
499
  df = table.dropna()
500
  df["ticker"] = df["ticker"].astype(str).str.upper().str.strip()
501
  df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce").fillna(0.0)
502
 
503
  symbols = [t for t in df["ticker"].tolist() if t]
504
  if len(symbols) == 0:
505
+ return None, "Add at least one ticker", "Universe empty", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=SUG_COLS), None
506
 
507
  symbols = validate_tickers(symbols, years_lookback)
508
  if len(symbols) == 0:
509
+ return None, "Could not validate any tickers", "Universe invalid", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=SUG_COLS), None
510
 
511
+ # Universe includes market
512
+ universe = list(sorted(set([s for s in symbols] + [MARKET_TICKER])))[:MAX_TICKERS]
513
 
514
+ # amounts -> weights
515
+ dfp = df[df["ticker"].isin(symbols)].copy()
516
+ amounts = {r["ticker"]: float(r["amount_usd"]) for _, r in dfp.iterrows()}
517
  rf_ann = RF_ANN
518
 
519
+ # historical moments
520
+ moms = estimate_all_moments_aligned(universe, years_lookback, rf_ann)
521
  betas, covA, erp_ann, sigma_mkt = moms["betas"], moms["cov_ann"], moms["erp_ann"], moms["sigma_m_ann"]
522
 
523
  gross = sum(abs(v) for v in amounts.values())
524
  if gross == 0:
525
+ return None, "All amounts are zero", "Universe ok", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=SUG_COLS), None
526
  weights = {k: v / gross for k, v in amounts.items()}
527
 
528
  beta_p, er_p, sigma_p = portfolio_stats(weights, covA, betas, rf_ann, erp_ann)
 
530
  a_sigma, b_sigma, mu_eff_sigma = efficient_same_sigma(sigma_p, rf_ann, erp_ann, sigma_mkt)
531
  a_mu, b_mu, sigma_eff_mu = efficient_same_return(er_p, rf_ann, erp_ann, sigma_mkt)
532
 
533
+ # Build synthetic dataset for THIS universe each run
534
+ ds_path = dataset_path_for_universe(universe)
535
+ synth_df = build_synthetic_dataset(universe, covA, betas, rf_ann, erp_ann, rows=SYNTH_ROWS)
536
+ save_synth_csv(synth_df, ds_path)
537
+
538
+ # Suggestions from dataset (top 3)
539
+ picks = dataset_suggestions(ds_path, universe, risk_level, use_embeddings, top_k=3)
540
+
541
+ # For plot, show first suggestion if any
542
+ first_sugg = None
543
+ if picks:
544
+ first_sugg = (float(picks[0]["sigma"]), float(picks[0]["er"]))
545
+
546
+ img = plot_cml_percent(
547
  rf_ann, erp_ann, sigma_mkt,
548
  sigma_p, er_p,
549
  sigma_p, mu_eff_sigma,
550
  sigma_eff_mu, er_p,
551
+ suggestion=first_sugg
552
  )
553
 
554
+ # Build summary
555
  info = build_summary_md(
556
  years_lookback, HORIZON_YEARS, rf_ann, RF_CODE, erp_ann, sigma_mkt,
557
  beta_p, er_p, sigma_p,
558
  a_sigma, b_sigma, mu_eff_sigma,
559
+ a_mu, b_mu, sigma_eff_mu,
560
+ risk_level=risk_level,
561
+ suggestion=picks[0] if picks else None
562
  )
563
 
564
+ # Positions table
565
  rows = []
566
+ for t in symbols:
 
567
  rows.append({
568
  "ticker": t,
569
  "amount_usd": amounts.get(t, 0.0),
570
  "weight_exposure": weights.get(t, 0.0),
571
+ "beta": 1.0 if t == MARKET_TICKER else betas.get(t, np.nan),
572
  })
573
  pos_table = pd.DataFrame(rows, columns=POS_COLS)
574
 
575
+ # Suggestions table (long format)
576
+ if picks:
577
+ sugg_rows = []
578
+ for p in picks:
579
+ for k, v in sorted(p["weights"].items(), key=lambda kv: -abs(kv[1]))[:12]:
580
+ sugg_rows.append({
581
+ "pick": p["pick"],
582
+ "ticker": k,
583
+ "weight_exposure": v,
584
+ "er_%": p["er"] * 100.0,
585
+ "sigma_%": p["sigma"] * 100.0,
586
+ "beta": p["beta"],
587
+ })
588
+ sugg_table = pd.DataFrame(sugg_rows, columns=SUG_COLS)
589
+ else:
590
+ sugg_table = pd.DataFrame(columns=SUG_COLS)
591
 
592
+ uni_msg = f"Universe set to: {', '.join(universe)}"
593
+ return img, info, uni_msg, pos_table, sugg_table, ds_path
594
 
595
+ # ---------------- launch UI ----------------
596
  ensure_data_dir()
597
+
598
+ # Initialize risk-free from default horizon
599
  HORIZON_YEARS = 5.0
600
  RF_CODE = fred_series_for_horizon(HORIZON_YEARS)
601
  RF_ANN = fetch_fred_yield_annual(RF_CODE)
 
604
  gr.Markdown(
605
  "## Efficient Portfolio Advisor\n"
606
  "Search symbols, enter dollar amounts, set your horizon. "
607
+ "Prices from **Yahoo Finance**. Risk-free from **FRED**. "
608
+ "Low/Medium/High suggestions are chosen **only** from a 1,000-row dataset generated from your current universe, "
609
+ "optionally refined with **finance embeddings**."
610
  )
611
 
612
  with gr.Row():
 
614
  q = gr.Textbox(label="Search symbol")
615
  search_note = gr.Markdown()
616
  matches = gr.Dropdown(choices=[], label="Matches")
617
+ with gr.Row():
618
+ search_btn = gr.Button("Search")
619
+ add_btn = gr.Button("Add selected to portfolio")
620
 
621
  gr.Markdown("### Portfolio positions — type dollar amounts (negatives allowed for shorts)")
622
  table = gr.Dataframe(
 
626
  col_count=(2, "fixed")
627
  )
628
 
629
+ horizon = gr.Number(label="Horizon in years (1–100)", value=int(HORIZON_YEARS), precision=0)
630
+ lookback = gr.Slider(1, 10, value=DEFAULT_LOOKBACK_YEARS, step=1, label="Lookback years for beta & sigma")
631
 
632
+ gr.Markdown("### Suggestions")
633
+ risk_level = gr.Radio(["Low", "Medium", "High"], value="Medium", label="Risk tolerance")
634
+ use_embeddings = gr.Checkbox(label="Use finance embeddings to refine picks", value=True)
635
 
636
+ run_btn = gr.Button("Compute (build dataset & suggest)")
 
 
 
 
637
 
638
  with gr.Column(scale=1):
639
  plot = gr.Image(label="Capital Market Line (CML)", type="pil")
640
  summary = gr.Markdown(label="Summary")
641
+ universe_msg = gr.Textbox(label="Universe status", interactive=False)
642
  positions = gr.Dataframe(
643
  label="Computed positions",
644
  headers=POS_COLS,
645
  datatype=["str", "number", "number", "number"],
646
  col_count=(len(POS_COLS), "fixed"),
647
+ value=pd.DataFrame(columns=POS_COLS),
648
  interactive=False
649
  )
650
  suggestions = gr.Dataframe(
651
+ label="Dataset-based suggestions (top 3 — weights shown as exposures)",
652
  headers=SUG_COLS,
653
+ datatype=["number", "str", "number", "number", "number", "number"],
654
  col_count=(len(SUG_COLS), "fixed"),
655
+ value=pd.DataFrame(columns=SUG_COLS),
656
  interactive=False
657
  )
658
+ dl = gr.File(label="Generated dataset CSV", value=None, visible=True)
 
659
 
660
+ # Wire up events
661
  def do_search(query):
662
  note, options = search_tickers_cb(query)
663
  return note, gr.update(choices=options)
 
668
  horizon.change(fn=set_horizon, inputs=horizon, outputs=universe_msg)
669
 
670
  run_btn.click(
671
+ fn=compute_and_suggest,
672
+ inputs=[lookback, table, risk_level, use_embeddings],
673
  outputs=[plot, summary, universe_msg, positions, suggestions, dl]
674
  )
675
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
676
  if __name__ == "__main__":
677
  demo.launch()