Tulitula committed on
Commit
8d18142
·
verified ·
1 Parent(s): 9c2fb56

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +349 -364
app.py CHANGED
@@ -1,5 +1,14 @@
 
 
 
 
 
 
 
 
 
1
  # app.py
2
- import os, io, math, json, hashlib, warnings
3
  warnings.filterwarnings("ignore")
4
 
5
  from typing import List, Tuple, Dict, Optional
@@ -7,62 +16,39 @@ from typing import List, Tuple, Dict, Optional
7
  import numpy as np
8
  import pandas as pd
9
  import matplotlib.pyplot as plt
10
- from matplotlib.ticker import PercentFormatter
11
- from PIL import Image
12
-
13
  import gradio as gr
 
14
  import requests
15
  import yfinance as yf
16
 
17
- # Optional embeddings (lazy-loaded)
18
- _EMBED_MODEL = None
19
- def get_embed_model():
20
- global _EMBED_MODEL
21
- if _EMBED_MODEL is None:
22
- try:
23
- from sentence_transformers import SentenceTransformer
24
- _EMBED_MODEL = SentenceTransformer("FinLang/finance-embeddings-investopedia")
25
- except Exception as e:
26
- _EMBED_MODEL = False
27
- return _EMBED_MODEL
28
 
29
  # ---------------- config ----------------
30
  DATA_DIR = "data"
31
  os.makedirs(DATA_DIR, exist_ok=True)
32
 
33
- MARKET_TICKER = "VOO" # “market” proxy
34
- DEFAULT_LOOKBACK_YEARS = 5
35
  MAX_TICKERS = 30
36
- SYNTH_ROWS = 1000
 
37
 
38
  # UI tables
39
  POS_COLS = ["ticker", "amount_usd", "weight_exposure", "beta"]
40
- SUG_COLS = ["pick", "ticker", "weight_exposure", "er_%", "sigma_%", "beta"]
41
 
42
- # FRED tenor map
43
  FRED_MAP = [
44
- (1, "DGS1"), (2, "DGS2"), (3, "DGS3"),
45
- (5, "DGS5"), (7, "DGS7"), (10, "DGS10"),
46
- (20, "DGS20"), (30, "DGS30"), (100, "DGS30"),
 
 
 
 
 
 
47
  ]
48
 
49
- # Session globals
50
- HORIZON_YEARS = 5.0
51
- RF_CODE = "DGS5"
52
- RF_ANN = 0.02
53
-
54
- def ensure_data_dir():
55
- os.makedirs(DATA_DIR, exist_ok=True)
56
-
57
- def dataset_path_for_universe(universe: List[str]) -> str:
58
- # unique file per universe (order-independent)
59
- key = hashlib.sha256((",".join(sorted(universe))).encode()).hexdigest()[:10]
60
- return os.path.join(DATA_DIR, f"investor_profiles_{key}.csv")
61
-
62
- # ---------------- tiny utils ----------------
63
- def fmt_pct(x: float) -> str:
64
- return f"{x*100:.2f}%"
65
-
66
  def fred_series_for_horizon(years: float) -> str:
67
  y = max(1.0, min(100.0, float(years)))
68
  for cutoff, code in FRED_MAP:
@@ -81,7 +67,29 @@ def fetch_fred_yield_annual(code: str) -> float:
81
  except Exception:
82
  return 0.03
83
 
84
- # ---------------- Yahoo search ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  def yahoo_search(query: str):
86
  if not query or len(query.strip()) == 0:
87
  return []
@@ -105,78 +113,70 @@ def yahoo_search(query: str):
105
  except Exception:
106
  return [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": "n/a"}]
107
 
108
- def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
109
- start = pd.Timestamp.today(tz="UTC") - pd.DateOffset(years=years, days=7)
110
- end = pd.Timestamp.today(tz="UTC")
111
- df = yf.download(
112
- list(dict.fromkeys(tickers)),
113
- start=start.date(), end=end.date(),
114
- interval="1mo", auto_adjust=True, progress=False
115
- )["Close"]
116
- if isinstance(df, pd.Series):
117
- df = df.to_frame()
118
- df = df.dropna(how="all").fillna(method="ffill")
119
- return df
120
-
121
- def monthly_returns(prices: pd.DataFrame) -> pd.DataFrame:
122
- return prices.pct_change().dropna()
123
-
124
  def validate_tickers(symbols: List[str], years: int) -> List[str]:
125
- ok, df = [], fetch_prices_monthly(list(set(symbols)), years)
126
- for s in symbols:
127
- if s in df.columns:
128
- ok.append(s)
 
 
 
 
 
 
 
129
  return ok
130
 
131
- # ---------------- moments (aligned) ----------------
132
  def get_aligned_monthly_returns(symbols: List[str], years: int) -> pd.DataFrame:
133
- uniq = [c for c in dict.fromkeys(symbols) if c != MARKET_TICKER]
134
  tickers = uniq + [MARKET_TICKER]
135
  px = fetch_prices_monthly(tickers, years)
 
 
 
 
136
  rets = monthly_returns(px)
137
- cols = [c for c in uniq if c in rets.columns] + ([MARKET_TICKER] if MARKET_TICKER in rets.columns else [])
138
  R = rets[cols].dropna(how="any")
139
- return R.loc[:, ~R.columns.duplicated()]
140
 
141
- def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
142
- R = get_aligned_monthly_returns(symbols + [MARKET_TICKER], years)
143
- if MARKET_TICKER not in R.columns or R.shape[0] < 3:
144
- raise ValueError("Not enough aligned market data")
145
 
 
 
 
 
 
 
 
146
  rf_m = rf_ann / 12.0
147
 
148
- # market series
149
- m = R[MARKET_TICKER]
150
  if isinstance(m, pd.DataFrame):
151
  m = m.iloc[:, 0].squeeze()
152
 
153
- mu_m_ann = float(m.mean() * 12.0)
154
- sigma_m_ann = float(m.std(ddof=1) * math.sqrt(12.0))
155
  erp_ann = float(mu_m_ann - rf_ann)
156
 
157
  ex_m = m - rf_m
158
  var_m = float(np.var(ex_m.values, ddof=1))
159
- var_m = max(var_m, 1e-10)
160
 
161
- # betas for each asset (including market==1)
162
  betas: Dict[str, float] = {}
163
- for s in R.columns:
164
- if s == MARKET_TICKER:
165
- betas[s] = 1.0
166
- continue
167
  ex_s = R[s] - rf_m
168
- cov_sm = float(np.cov(ex_s.values, ex_m.values, ddof=1)[0, 1])
169
- betas[s] = float(cov_sm / var_m)
170
-
171
- # IMPORTANT FIX: include MARKET in covariance so σ is never understated
172
- asset_cols = list(R.columns)
173
- if asset_cols:
174
- cov_m = np.cov(R[asset_cols].values.T, ddof=1)
175
- covA = pd.DataFrame(cov_m * 12.0, index=asset_cols, columns=asset_cols)
176
- else:
177
- covA = pd.DataFrame(np.zeros((0, 0)))
178
 
179
- return {"betas": betas, "cov_ann": covA, "erp_ann": erp_ann, "sigma_m_ann": sigma_m_ann}
180
 
181
  def capm_er(beta: float, rf_ann: float, erp_ann: float) -> float:
182
  return float(rf_ann + beta * erp_ann)
@@ -195,71 +195,73 @@ def portfolio_stats(weights: Dict[str, float],
195
  beta_p = float(np.dot([betas.get(t, 0.0) for t in tickers], w_expo))
196
  er_p = capm_er(beta_p, rf_ann, erp_ann)
197
  cov = cov_ann.reindex(index=tickers, columns=tickers).fillna(0.0).to_numpy()
198
- sigma_p = math.sqrt(float(max(w_expo.T @ cov @ w_expo, 0.0)))
 
199
  return beta_p, er_p, sigma_p
200
 
201
- # ---------------- CML helpers & plot ----------------
202
  def efficient_same_sigma(sigma_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
203
  if sigma_mkt <= 1e-12:
204
  return 0.0, 1.0, rf_ann
205
  a = sigma_target / sigma_mkt
206
- return a, 1 - a, rf_ann + a * erp_ann
207
 
208
  def efficient_same_return(mu_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
209
  if abs(erp_ann) <= 1e-12:
210
  return 0.0, 1.0, rf_ann
211
  a = (mu_target - rf_ann) / erp_ann
212
- return a, 1 - a, abs(a) * sigma_mkt
213
 
214
- def plot_cml_percent(rf_ann, erp_ann, sigma_mkt,
215
- pt_sigma, pt_mu,
216
- same_sigma_sigma, same_sigma_mu,
217
- same_mu_sigma, same_mu_mu,
218
- suggestion: Optional[Tuple[float, float]] = None) -> Image.Image:
219
- fig = plt.figure(figsize=(6, 4), dpi=120)
 
 
220
 
221
  xmax = max(
222
- 0.3,
223
  sigma_mkt * 2.0,
224
  pt_sigma * 1.4,
225
- same_sigma_sigma * 1.4,
226
  same_mu_sigma * 1.4,
227
- (suggestion[0] if suggestion else 0.0) * 1.5,
 
228
  )
229
  xs = np.linspace(0, xmax, 160)
230
  slope = erp_ann / max(sigma_mkt, 1e-12)
231
  cml = rf_ann + slope * xs
232
- plt.plot(xs, cml, label="CML via Market")
233
-
234
- # Points
235
- plt.scatter([0.0], [rf_ann], label="Risk-free (FRED)")
236
- plt.scatter([sigma_mkt], [rf_ann + erp_ann], label=f"Market {MARKET_TICKER}")
237
- plt.scatter([pt_sigma], [pt_mu], label="Your portfolio")
238
- plt.scatter([same_sigma_sigma], [same_sigma_mu], label="Efficient same sigma")
239
- plt.scatter([same_mu_sigma], [same_mu_mu], label="Efficient same return")
240
- if suggestion is not None:
241
- plt.scatter([suggestion[0]], [suggestion[1]], marker="X", s=70, label="Suggestion")
242
-
243
- # Guides (percent annotated)
244
- plt.plot([pt_sigma, same_sigma_sigma], [pt_mu, same_sigma_mu], ls="--", lw=1.0, alpha=0.7, c="gray")
245
- d_ret = (same_sigma_mu - pt_mu) * 100.0
246
- plt.annotate(f"Return gain at same σ {d_ret:+.2f}%",
247
- xy=(same_sigma_sigma, same_sigma_mu),
248
- xytext=(same_sigma_sigma, same_sigma_mu + 0.03),
249
- arrowprops=dict(arrowstyle="->", lw=1.0), fontsize=9, ha="center")
250
-
251
- plt.plot([pt_sigma, same_mu_sigma], [pt_mu, same_mu_mu], ls="--", lw=1.0, alpha=0.7, c="gray")
252
- d_sig = (same_mu_sigma - pt_sigma) * 100.0
253
- plt.annotate(f"Risk change at same μ {d_sig:+.2f}%",
254
- xy=(same_mu_sigma, same_mu_mu),
255
- xytext=(same_mu_sigma + 0.01, same_mu_mu),
256
- arrowprops=dict(arrowstyle="->", lw=1.0), fontsize=9, va="center")
257
 
258
  plt.xlabel("σ (annualized)")
259
  plt.ylabel("Expected return (annual)")
260
- plt.gca().xaxis.set_major_formatter(PercentFormatter(1.0))
261
- plt.gca().yaxis.set_major_formatter(PercentFormatter(1.0))
262
- plt.legend(loc="best")
263
  plt.tight_layout()
264
 
265
  buf = io.BytesIO()
@@ -268,161 +270,149 @@ def plot_cml_percent(rf_ann, erp_ann, sigma_mkt,
268
  buf.seek(0)
269
  return Image.open(buf)
270
 
271
- # ---------------- synthetic dataset ----------------
272
- def synth_profile(seed: int) -> str:
273
- rng = np.random.default_rng(seed)
274
- risk = rng.choice(["cautious", "balanced", "moderate", "growth", "aggressive"])
275
- horizon = rng.choice(["3y", "5y", "7y", "10y", "15y"])
276
- goal = rng.choice(["retirement", "first home", "education", "wealth building", "travel", "emergency"])
277
- return f"{risk} investor, {horizon} horizon, goal {goal}"
278
-
279
- def build_synthetic_dataset(universe: List[str],
280
- covA: pd.DataFrame,
281
- betas: Dict[str, float],
282
- rf_ann: float,
283
- erp_ann: float,
284
- rows: int = SYNTH_ROWS) -> pd.DataFrame:
285
- # Ensure MARKET in universe (we may sample it too)
286
- symbols = list(sorted(set(universe + [MARKET_TICKER])))[:MAX_TICKERS]
287
- rng = np.random.default_rng(123)
288
- data = []
289
- for i in range(rows):
290
- k = rng.integers(low=min(2, len(symbols)), high=min(8, len(symbols)) + 1)
 
 
291
  picks = list(rng.choice(symbols, size=k, replace=False))
292
  signs = rng.choice([-1.0, 1.0], size=k, p=[0.25, 0.75])
293
  raw = rng.dirichlet(np.ones(k))
294
- gross = 1.0 + float(rng.gamma(2.0, 0.5))
295
  w = gross * signs * raw
296
- wmap = {picks[j]: w[j] for j in range(k)}
297
-
298
- beta_p, er_p, sigma_p = portfolio_stats(wmap, covA, betas, rf_ann, erp_ann)
299
- data.append({
300
  "id": i,
301
- "profile_text": synth_profile(10_000 + i),
302
  "tickers": ",".join(picks),
303
- "weights": ",".join(f"{x:.5f}" for x in w),
304
  "beta_p": beta_p,
305
  "er_p": er_p,
306
  "sigma_p": sigma_p
307
  })
308
- return pd.DataFrame(data)
309
 
310
- def save_synth_csv(df: pd.DataFrame, path: str):
311
- os.makedirs(os.path.dirname(path), exist_ok=True)
 
312
  df.to_csv(path, index=False)
313
-
314
- def _row_to_exposures(row: pd.Series, universe: List[str]) -> Optional[np.ndarray]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  try:
316
- ts = [t.strip() for t in str(row["tickers"]).split(",")]
317
- ws = [float(x) for x in str(row["weights"]).split(",")]
318
- wmap = {t: ws[i] for i, t in enumerate(ts) if i < len(ws)}
319
- x = np.array([wmap.get(t, 0.0) for t in universe], dtype=float)
320
- gross = float(np.sum(np.abs(x)))
321
- if gross <= 1e-12:
322
- return None
323
- return x / gross
324
  except Exception:
325
  return None
 
 
326
 
327
- def candidate_text(weights_map: Dict[str, float], er: float, sigma: float, beta: float) -> str:
328
- top = sorted(weights_map.items(), key=lambda kv: -abs(kv[1]))[:6]
329
- parts = [f"{k} {v:+.2f}" for k, v in top]
330
- return (
331
- f"portfolio with expected return {er:.4f}, volatility {sigma:.4f}, beta {beta:.2f}. "
332
- f"top exposures: {'; '.join(parts)}"
333
- )
334
-
335
- def dataset_suggestions(csv_path: str,
336
- universe: List[str],
337
- risk_level: str,
338
- use_embeddings: bool,
339
- top_k: int = 3):
340
- try:
341
- df = pd.read_csv(csv_path)
342
- except Exception:
343
- return []
344
 
345
- # Build rows usable for this universe
346
- rows = []
347
- for _, r in df.iterrows():
348
- x = _row_to_exposures(r, universe)
349
- if x is None:
350
- continue
351
- # recover a printable mapping for display
352
- ts = [t.strip() for t in str(r["tickers"]).split(",")]
353
- ws = [float(x) for x in str(r["weights"]).split(",")]
354
- wmap = {}
355
- for i in range(min(len(ts), len(ws))):
356
- wmap[ts[i]] = ws[i]
357
- gross = sum(abs(v) for v in wmap.values()) or 1.0
358
- wmap = {k: v / gross for k, v in wmap.items()}
359
- rows.append((wmap, float(r["er_p"]), float(r["sigma_p"]), float(r["beta_p"])))
360
-
361
- if not rows:
362
- return []
363
 
364
- # Risk buckets by sigma
365
- sigmas = np.array([r[2] for r in rows])
366
- q10, q50, q90 = np.quantile(sigmas, [0.10, 0.50, 0.90])
367
-
368
- if risk_level == "Low":
369
- pool = [r for r in rows if r[2] <= q10]
370
- target_sigma = q10
371
- query = "low risk conservative stable portfolio minimize volatility"
372
- elif risk_level == "High":
373
- pool = [r for r in rows if r[2] >= q90]
374
- target_sigma = q90
375
- query = "high risk aggressive growth portfolio accept high volatility maximize returns"
376
- else:
377
- # Medium around median band
378
- band = 0.03 # ±3% absolute sigma band around median
379
- pool = [r for r in rows if abs(r[2] - q50) <= band]
380
- if not pool:
381
- # fallback: closest N to median
382
- pool = sorted(rows, key=lambda r: abs(r[2] - q50))[: max(10, top_k)]
383
- target_sigma = q50
384
- query = "balanced moderate risk diversified portfolio"
385
-
386
- if not pool:
387
- # fallback: take closest overall
388
- pool = sorted(rows, key=lambda r: abs(r[2] - target_sigma))[: max(10, top_k)]
389
-
390
- # Rank inside pool
391
- if use_embeddings and get_embed_model():
392
  try:
393
- model = get_embed_model()
394
- texts = [candidate_text(*r) for r in pool]
395
- embs = model.encode([query] + texts, normalize_embeddings=True)
396
- qv = embs[0:1]
397
- tv = embs[1:]
398
- sims = (tv @ qv.T).ravel()
399
- ranked = [pool[i] for i in np.argsort(-sims)]
 
400
  except Exception:
401
- ranked = sorted(pool, key=lambda r: abs(r[2] - target_sigma))
402
  else:
403
- ranked = sorted(pool, key=lambda r: abs(r[2] - target_sigma))
404
-
405
- picks = ranked[:top_k]
406
- out = []
407
- for i, (wmap, er, sigma, beta) in enumerate(picks, start=1):
408
- # normalize for display
409
- gross = sum(abs(v) for v in wmap.values()) or 1.0
410
- wmap = {k: v / gross for k, v in wmap.items()}
411
- out.append({"pick": i, "weights": wmap, "er": er, "sigma": sigma, "beta": beta})
412
- return out
413
-
414
- # ---------------- summary ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
  def build_summary_md(lookback, horizon, rf, rf_code, erp, sigma_mkt,
416
  beta_p, er_p, sigma_p,
417
  a_sigma, b_sigma, mu_eff_sigma,
418
  a_mu, b_mu, sigma_eff_mu,
419
- risk_level: str,
420
- suggestion: Optional[Dict] = None) -> str:
421
  lines = []
422
  lines.append("### Inputs")
423
- lines.append(f"- Lookback years: **{int(lookback)}**")
424
  lines.append(f"- Horizon years: **{int(round(horizon))}**")
425
- lines.append(f"- Risk-free: **{fmt_pct(rf)}** from **{rf_code}**")
426
  lines.append(f"- Market ERP: **{fmt_pct(erp)}**")
427
  lines.append(f"- Market σ: **{fmt_pct(sigma_mkt)}**")
428
  lines.append("")
@@ -432,17 +422,25 @@ def build_summary_md(lookback, horizon, rf, rf_code, erp, sigma_mkt,
432
  lines.append(f"- Expected return: **{fmt_pct(er_p)}**")
433
  lines.append("")
434
  lines.append("### Efficient alternatives on CML")
435
- lines.append(f"- Same σ: market **{a_sigma:.2f}**, bills **{b_sigma:.2f}**, μ **{fmt_pct(mu_eff_sigma)}**")
436
- lines.append(f"- Same μ: market **{a_mu:.2f}**, bills **{b_mu:.2f}**, σ **{fmt_pct(sigma_eff_mu)}**")
437
- lines.append("")
438
- lines.append(f"### Dataset-based suggestions (risk = **{risk_level}**)")
439
- if suggestion:
440
- lines.append(f"- Top suggestion μ **{fmt_pct(suggestion['er'])}**, σ **{fmt_pct(suggestion['sigma'])}**, β **{suggestion['beta']:.2f}**")
441
- else:
442
- lines.append("- No suggestion available.")
 
 
443
  return "\n".join(lines)
444
 
445
- # ---------------- gradio callbacks ----------------
 
 
 
 
 
 
446
  def search_tickers_cb(q: str):
447
  hits = yahoo_search(q)
448
  if not hits:
@@ -489,124 +487,107 @@ def set_horizon(years: float):
489
  HORIZON_YEARS = y
490
  RF_CODE = code
491
  RF_ANN = rf
492
- return f"Risk free series {code}. Latest annual rate {rf:.2%}. Will be used on compute."
493
-
494
- def compute_and_suggest(years_lookback: int,
495
- table: pd.DataFrame,
496
- risk_level: str,
497
- use_embeddings: bool):
498
- # sanitize table
499
- df = table.dropna()
 
 
 
500
  df["ticker"] = df["ticker"].astype(str).str.upper().str.strip()
501
  df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce").fillna(0.0)
502
 
503
  symbols = [t for t in df["ticker"].tolist() if t]
 
504
  if len(symbols) == 0:
505
- return None, "Add at least one ticker", "Universe empty", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=SUG_COLS), None
506
-
507
- symbols = validate_tickers(symbols, years_lookback)
508
- if len(symbols) == 0:
509
- return None, "Could not validate any tickers", "Universe invalid", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=SUG_COLS), None
510
 
511
- # Universe includes market
512
- universe = list(sorted(set([s for s in symbols] + [MARKET_TICKER])))[:MAX_TICKERS]
513
 
514
- # amounts -> weights
515
- dfp = df[df["ticker"].isin(symbols)].copy()
516
- amounts = {r["ticker"]: float(r["amount_usd"]) for _, r in dfp.iterrows()}
517
  rf_ann = RF_ANN
518
 
519
- # historical moments
520
- moms = estimate_all_moments_aligned(universe, years_lookback, rf_ann)
521
  betas, covA, erp_ann, sigma_mkt = moms["betas"], moms["cov_ann"], moms["erp_ann"], moms["sigma_m_ann"]
522
 
523
- gross = sum(abs(v) for v in amounts.values())
524
- if gross == 0:
525
- return None, "All amounts are zero", "Universe ok", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=SUG_COLS), None
526
- weights = {k: v / gross for k, v in amounts.items()}
527
 
 
528
  beta_p, er_p, sigma_p = portfolio_stats(weights, covA, betas, rf_ann, erp_ann)
529
 
530
  a_sigma, b_sigma, mu_eff_sigma = efficient_same_sigma(sigma_p, rf_ann, erp_ann, sigma_mkt)
531
  a_mu, b_mu, sigma_eff_mu = efficient_same_return(er_p, rf_ann, erp_ann, sigma_mkt)
532
 
533
- # Build synthetic dataset for THIS universe each run
534
- ds_path = dataset_path_for_universe(universe)
535
- synth_df = build_synthetic_dataset(universe, covA, betas, rf_ann, erp_ann, rows=SYNTH_ROWS)
536
- save_synth_csv(synth_df, ds_path)
537
-
538
- # Suggestions from dataset (top 3)
539
- picks = dataset_suggestions(ds_path, universe, risk_level, use_embeddings, top_k=3)
540
-
541
- # For plot, show first suggestion if any
542
- first_sugg = None
543
- if picks:
544
- first_sugg = (float(picks[0]["sigma"]), float(picks[0]["er"]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
 
546
- img = plot_cml_percent(
 
547
  rf_ann, erp_ann, sigma_mkt,
548
  sigma_p, er_p,
549
  sigma_p, mu_eff_sigma,
550
  sigma_eff_mu, er_p,
551
- suggestion=first_sugg
552
  )
553
 
554
- # Build summary
555
  info = build_summary_md(
556
- years_lookback, HORIZON_YEARS, rf_ann, RF_CODE, erp_ann, sigma_mkt,
557
  beta_p, er_p, sigma_p,
558
  a_sigma, b_sigma, mu_eff_sigma,
559
  a_mu, b_mu, sigma_eff_mu,
560
- risk_level=risk_level,
561
- suggestion=picks[0] if picks else None
562
  )
563
 
564
- # Positions table
565
- rows = []
566
- for t in symbols:
567
- rows.append({
568
- "ticker": t,
569
- "amount_usd": amounts.get(t, 0.0),
570
- "weight_exposure": weights.get(t, 0.0),
571
- "beta": 1.0 if t == MARKET_TICKER else betas.get(t, np.nan),
572
- })
573
- pos_table = pd.DataFrame(rows, columns=POS_COLS)
574
-
575
- # Suggestions table (long format)
576
- if picks:
577
- sugg_rows = []
578
- for p in picks:
579
- for k, v in sorted(p["weights"].items(), key=lambda kv: -abs(kv[1]))[:12]:
580
- sugg_rows.append({
581
- "pick": p["pick"],
582
- "ticker": k,
583
- "weight_exposure": v,
584
- "er_%": p["er"] * 100.0,
585
- "sigma_%": p["sigma"] * 100.0,
586
- "beta": p["beta"],
587
- })
588
- sugg_table = pd.DataFrame(sugg_rows, columns=SUG_COLS)
589
- else:
590
- sugg_table = pd.DataFrame(columns=SUG_COLS)
591
-
592
- uni_msg = f"Universe set to: {', '.join(universe)}"
593
- return img, info, uni_msg, pos_table, sugg_table, ds_path
594
-
595
- # ---------------- launch UI ----------------
596
- ensure_data_dir()
597
-
598
- # Initialize risk-free from default horizon
599
- HORIZON_YEARS = 5.0
600
- RF_CODE = fred_series_for_horizon(HORIZON_YEARS)
601
- RF_ANN = fetch_fred_yield_annual(RF_CODE)
602
 
 
603
  with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
604
  gr.Markdown(
605
  "## Efficient Portfolio Advisor\n"
606
- "Search symbols, enter dollar amounts, set your horizon. "
607
- "Prices from **Yahoo Finance**. Risk-free from **FRED**. "
608
- "Low/Medium/High suggestions are chosen **only** from a 1,000-row dataset generated from your current universe, "
609
- "optionally refined with **finance embeddings**."
610
  )
611
 
612
  with gr.Row():
@@ -623,22 +604,24 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
623
  headers=["ticker", "amount_usd"],
624
  datatype=["str", "number"],
625
  row_count=0,
626
- col_count=(2, "fixed")
 
627
  )
628
 
629
- horizon = gr.Number(label="Horizon in years (1–100)", value=int(HORIZON_YEARS), precision=0)
630
  lookback = gr.Slider(1, 10, value=DEFAULT_LOOKBACK_YEARS, step=1, label="Lookback years for beta & sigma")
631
 
632
  gr.Markdown("### Suggestions")
633
- risk_level = gr.Radio(["Low", "Medium", "High"], value="Medium", label="Risk tolerance")
634
- use_embeddings = gr.Checkbox(label="Use finance embeddings to refine picks", value=True)
635
 
636
  run_btn = gr.Button("Compute (build dataset & suggest)")
637
 
638
  with gr.Column(scale=1):
639
  plot = gr.Image(label="Capital Market Line (CML)", type="pil")
640
- summary = gr.Markdown(label="Summary")
641
  universe_msg = gr.Textbox(label="Universe status", interactive=False)
 
642
  positions = gr.Dataframe(
643
  label="Computed positions",
644
  headers=POS_COLS,
@@ -647,17 +630,18 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
647
  value=pd.DataFrame(columns=POS_COLS),
648
  interactive=False
649
  )
 
650
  suggestions = gr.Dataframe(
651
- label="Dataset-based suggestions (top 3 weights shown as exposures)",
652
- headers=SUG_COLS,
653
- datatype=["number", "str", "number", "number", "number", "number"],
654
- col_count=(len(SUG_COLS), "fixed"),
655
- value=pd.DataFrame(columns=SUG_COLS),
656
  interactive=False
657
  )
 
658
  dl = gr.File(label="Generated dataset CSV", value=None, visible=True)
659
 
660
- # Wire up events
661
  def do_search(query):
662
  note, options = search_tickers_cb(query)
663
  return note, gr.update(choices=options)
@@ -668,10 +652,11 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
668
  horizon.change(fn=set_horizon, inputs=horizon, outputs=universe_msg)
669
 
670
  run_btn.click(
671
- fn=compute_and_suggest,
672
- inputs=[lookback, table, risk_level, use_embeddings],
673
  outputs=[plot, summary, universe_msg, positions, suggestions, dl]
674
  )
675
 
676
  if __name__ == "__main__":
677
  demo.launch()
 
 
1
+ Here’s a full, drop-in **app.py** that:
2
+
3
+ * keeps the ticker search + portfolio table UX you liked
4
+ * shows the CML with **percent axes**
5
+ * builds a **1,000-row synthetic dataset** for your current universe
6
+ * gives a **single, clean suggestion** (based on Low/Medium/High risk) as **weights (%) and dollars (\$)**
7
+ * can optionally **re-rank** the suggestion with **finance embeddings** (FinLang)
8
+
9
+ ```python
10
  # app.py
11
+ import os, io, math, json, warnings, hashlib, random
12
  warnings.filterwarnings("ignore")
13
 
14
  from typing import List, Tuple, Dict, Optional
 
16
  import numpy as np
17
  import pandas as pd
18
  import matplotlib.pyplot as plt
 
 
 
19
  import gradio as gr
20
+ from PIL import Image
21
  import requests
22
  import yfinance as yf
23
 
24
+ from sklearn.neighbors import KNeighborsRegressor
25
+ from sklearn.preprocessing import StandardScaler
 
 
 
 
 
 
 
 
 
26
 
27
  # ---------------- config ----------------
28
  DATA_DIR = "data"
29
  os.makedirs(DATA_DIR, exist_ok=True)
30
 
 
 
31
  MAX_TICKERS = 30
32
+ DEFAULT_LOOKBACK_YEARS = 10
33
+ MARKET_TICKER = "VOO" # fall back to SPY if needed
34
 
35
  # UI tables
36
  POS_COLS = ["ticker", "amount_usd", "weight_exposure", "beta"]
 
37
 
38
+ # FRED curve mapping: horizon -> series code
39
  FRED_MAP = [
40
+ (1, "DGS1"),
41
+ (2, "DGS2"),
42
+ (3, "DGS3"),
43
+ (5, "DGS5"),
44
+ (7, "DGS7"),
45
+ (10, "DGS10"),
46
+ (20, "DGS20"),
47
+ (30, "DGS30"),
48
+ (100, "DGS30"),
49
  ]
50
 
51
+ # ---------------- helpers ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def fred_series_for_horizon(years: float) -> str:
53
  y = max(1.0, min(100.0, float(years)))
54
  for cutoff, code in FRED_MAP:
 
67
  except Exception:
68
  return 0.03
69
 
70
def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
    """Download monthly auto-adjusted close prices from Yahoo Finance.

    Args:
        tickers: Symbols to download; duplicates are dropped, order is kept.
        years: Lookback window in years (one extra week is added as padding).

    Returns:
        DataFrame indexed by date with one column per ticker. Rows without
        any price are dropped and remaining gaps are forward-filled. An empty
        DataFrame is returned when nothing could be downloaded.
    """
    symbols = list(dict.fromkeys(tickers))
    if not symbols:
        return pd.DataFrame()

    end = pd.Timestamp.today(tz="UTC")
    start = end - pd.DateOffset(years=years, days=7)
    # Default column grouping is used on purpose: with group_by="ticker" the
    # ticker sits on the outer column level and ["Close"] raises KeyError.
    raw = yf.download(
        symbols,
        start=start.date(),
        end=end.date(),
        interval="1mo",
        auto_adjust=True,
        progress=False,
    )
    if raw is None or raw.empty:
        return pd.DataFrame()

    columns = raw.columns
    if isinstance(columns, pd.MultiIndex):
        # Locate whichever level carries the price-field names so the code
        # does not depend on the level order of the installed yfinance.
        field_level = next(
            (
                i
                for i in range(columns.nlevels)
                if "Close" in columns.get_level_values(i)
            ),
            None,
        )
        if field_level is None:
            return pd.DataFrame()
        close = raw.xs("Close", axis=1, level=field_level)
    elif "Close" in columns:
        close = raw["Close"]
    else:
        return pd.DataFrame()

    if isinstance(close, pd.Series):
        # Name the single column after the ticker so callers can look it up.
        close = close.to_frame(name=symbols[0]) if len(symbols) == 1 else close.to_frame()

    return close.dropna(how="all").ffill()
89
+
90
def monthly_returns(prices: pd.DataFrame) -> pd.DataFrame:
    """Convert a price table into simple period-over-period returns.

    Args:
        prices: Price levels, one column per ticker, ordered by date.

    Returns:
        Percentage changes with every row that contains a NaN removed
        (this always drops the first row, which has no predecessor).
    """
    return prices.pct_change().dropna()
92
+
93
  def yahoo_search(query: str):
94
  if not query or len(query.strip()) == 0:
95
  return []
 
113
  except Exception:
114
  return [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": "n/a"}]
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
def validate_tickers(symbols: List[str], years: int) -> List[str]:
    """Return the requested symbols that actually have price history.

    Args:
        symbols: Candidate ticker symbols (duplicates are ignored).
        years: Lookback window passed to ``fetch_prices_monthly``.

    Returns:
        The de-duplicated symbols, in input order, for which the download
        produced at least one non-missing price.
    """
    if not symbols:
        return []

    base = list(dict.fromkeys(symbols))
    # The market proxy is fetched alongside so a single-symbol request still
    # goes through the multi-ticker download path.
    px = fetch_prices_monthly(base + [MARKET_TICKER], years)

    valid = []
    for sym in base:
        if sym not in px.columns:
            continue
        # A column that is present but entirely NaN means the download failed
        # for that symbol; it must not be reported as valid.
        if bool(px[sym].notna().to_numpy().any()):
            valid.append(sym)
    return valid
129
 
130
+ # -------------- aligned moments --------------
131
def get_aligned_monthly_returns(
    symbols: List[str], years: int
) -> Tuple[pd.DataFrame, Optional[str]]:
    """Fetch monthly returns for the symbols plus the market proxy, aligned on dates.

    Args:
        symbols: Asset tickers; empty entries and duplicates are ignored.
        years: Lookback window in years.

    Returns:
        ``(R, mkt)`` where ``R`` holds the returns of every asset that has
        data followed by the market column, restricted to dates on which all
        columns are present, and ``mkt`` is the market column name
        (``MARKET_TICKER``, or ``"SPY"`` when only that is available).
        ``(empty DataFrame, None)`` is returned when no market series exists,
        so callers can always unpack two values.
    """
    assets = [s for s in dict.fromkeys(symbols) if s]
    px = fetch_prices_monthly(list(dict.fromkeys(assets + [MARKET_TICKER])), years)
    # Drop symbols that returned no prices at all; otherwise the row-wise
    # dropna inside monthly_returns would discard every observation.
    px = px.dropna(axis=1, how="all")

    if MARKET_TICKER in px.columns:
        mkt = MARKET_TICKER
    elif "SPY" in px.columns:
        mkt = "SPY"
    else:
        return pd.DataFrame(), None

    rets = monthly_returns(px)
    # Keep the market column exactly once, even if the user also holds it.
    cols = [c for c in assets if c != mkt and c in rets.columns] + [mkt]
    aligned = rets[cols].dropna(how="any")
    return aligned, mkt
143
 
144
def annualize_mean(m):
    """Scale a monthly mean return (scalar or array-like) to an annual figure.

    The input is converted to a float NumPy array and multiplied by 12.
    """
    return np.asarray(m, dtype=float) * 12.0
 
 
146
 
147
def annualize_sigma(s):
    """Scale a monthly standard deviation (scalar or array-like) to annual.

    The input is converted to a float NumPy array and multiplied by sqrt(12).
    """
    return np.asarray(s, dtype=float) * math.sqrt(12.0)
149
+
150
def estimate_all_moments_aligned(
    symbols: List[str], years: int, rf_ann: float
) -> Dict[str, object]:
    """Estimate betas, annualized covariance and market moments from history.

    Args:
        symbols: Asset tickers to analyse.
        years: Lookback window in years.
        rf_ann: Annual risk-free rate as a decimal (0.03 for 3%).

    Returns:
        Dict with keys ``"betas"`` (ticker -> beta, market fixed at 1.0),
        ``"cov_ann"`` (annualized covariance DataFrame covering every column,
        market included), ``"erp_ann"`` (market mean minus ``rf_ann``),
        ``"sigma_m_ann"`` (annualized market volatility) and ``"mkt"``
        (market column name).

    Raises:
        ValueError: If fewer than three aligned observations are available
            or no market series could be found.
    """
    aligned = get_aligned_monthly_returns(symbols, years)
    # The helper has returned either a bare frame or an (R, mkt) pair;
    # accept both so a failed lookup surfaces as the ValueError below.
    if isinstance(aligned, tuple):
        R, mkt = aligned
    else:
        R, mkt = aligned, None
    if R is None or R.empty or mkt is None or R.shape[0] < 3:
        raise ValueError("Not enough aligned data for selected tickers / lookback.")

    # A held market ticker may have been appended twice; keep one copy.
    R = R.loc[:, ~R.columns.duplicated()]
    rf_m = rf_ann / 12.0

    m = R[mkt]
    if isinstance(m, pd.DataFrame):
        m = m.iloc[:, 0].squeeze()

    mu_m_ann = float(annualize_mean(m.mean()))
    sigma_m_ann = float(annualize_sigma(m.std(ddof=1)))
    erp_ann = float(mu_m_ann - rf_ann)

    ex_m = m - rf_m
    # Floor the variance so a flat market series cannot cause division by ~0.
    var_m = max(float(np.var(ex_m.values, ddof=1)), 1e-6)

    betas: Dict[str, float] = {}
    for s in R.columns:
        if s == mkt:
            continue
        ex_s = R[s] - rf_m
        betas[s] = float(np.cov(ex_s.values, ex_m.values, ddof=1)[0, 1] / var_m)
    betas[mkt] = 1.0

    # Covariance includes the market column: a portfolio that holds the market
    # ticker would otherwise get zero variance for that position downstream.
    # DataFrame.cov also keeps the labels and handles the single-column case.
    covA = R.cov() * 12.0

    return {
        "betas": betas,
        "cov_ann": covA,
        "erp_ann": erp_ann,
        "sigma_m_ann": sigma_m_ann,
        "mkt": mkt,
    }
180
 
181
def capm_er(beta: float, rf_ann: float, erp_ann: float) -> float:
    """Return the CAPM expected annual return: ``rf_ann + beta * erp_ann``.

    Args:
        beta: Sensitivity to the market.
        rf_ann: Annual risk-free rate as a decimal.
        erp_ann: Annual equity risk premium as a decimal.
    """
    return float(rf_ann + beta * erp_ann)
 
195
  beta_p = float(np.dot([betas.get(t, 0.0) for t in tickers], w_expo))
196
  er_p = capm_er(beta_p, rf_ann, erp_ann)
197
  cov = cov_ann.reindex(index=tickers, columns=tickers).fillna(0.0).to_numpy()
198
+ v = float(w_expo.T @ cov @ w_expo)
199
+ sigma_p = math.sqrt(max(v, 0.0))
200
  return beta_p, er_p, sigma_p
201
 
202
+ # -------------- CML helpers --------------
203
def efficient_same_sigma(
    sigma_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float
) -> Tuple[float, float, float]:
    """Find the market/risk-free mix on the CML with a given volatility.

    Args:
        sigma_target: Volatility to match (annual, decimal).
        rf_ann: Annual risk-free rate.
        erp_ann: Annual market excess return.
        sigma_mkt: Annual market volatility.

    Returns:
        ``(a, b, mu)``: market weight, risk-free weight (``1 - a``) and the
        expected return of that mix. If the market volatility is effectively
        zero, everything goes to the risk-free asset: ``(0.0, 1.0, rf_ann)``.
    """
    if sigma_mkt <= 1e-12:
        return 0.0, 1.0, rf_ann
    a = sigma_target / sigma_mkt
    return a, 1.0 - a, rf_ann + a * erp_ann
208
 
209
def efficient_same_return(
    mu_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float
) -> Tuple[float, float, float]:
    """Find the market/risk-free mix on the CML with a given expected return.

    Args:
        mu_target: Expected annual return to match (decimal).
        rf_ann: Annual risk-free rate.
        erp_ann: Annual market excess return.
        sigma_mkt: Annual market volatility.

    Returns:
        ``(a, b, sigma)``: market weight, risk-free weight (``1 - a``) and the
        volatility of that mix (``|a| * sigma_mkt``, so a short market
        position still reports positive risk). If the risk premium is
        effectively zero the target cannot be reached by levering the market
        and ``(0.0, 1.0, rf_ann)`` is returned.
    """
    if abs(erp_ann) <= 1e-12:
        return 0.0, 1.0, rf_ann
    a = (mu_target - rf_ann) / erp_ann
    return a, 1.0 - a, abs(a) * sigma_mkt
214
 
215
+ def plot_cml(
216
+ rf_ann, erp_ann, sigma_mkt,
217
+ pt_sigma, pt_mu,
218
+ same_sigma_sigma, same_sigma_mu,
219
+ same_mu_sigma, same_mu_mu,
220
+ sugg_sigma=None, sugg_mu=None
221
+ ) -> Image.Image:
222
+ fig = plt.figure(figsize=(6.2, 4.2), dpi=120)
223
 
224
  xmax = max(
225
+ 0.30,
226
  sigma_mkt * 2.0,
227
  pt_sigma * 1.4,
 
228
  same_mu_sigma * 1.4,
229
+ same_sigma_sigma * 1.4,
230
+ (sugg_sigma or 0.0) * 1.4,
231
  )
232
  xs = np.linspace(0, xmax, 160)
233
  slope = erp_ann / max(sigma_mkt, 1e-12)
234
  cml = rf_ann + slope * xs
235
+ plt.plot(xs * 100.0, cml * 100.0, label="CML via Market")
236
+
237
+ # key points
238
+ plt.scatter([0.0], [rf_ann * 100.0], label="Risk-free (FRED)")
239
+ plt.scatter([sigma_mkt * 100.0], [(rf_ann + erp_ann) * 100.0], label="Market (VOO)")
240
+ plt.scatter([pt_sigma * 100.0], [pt_mu * 100.0], label="Your portfolio")
241
+
242
+ plt.scatter([same_sigma_sigma * 100.0], [same_sigma_mu * 100.0], label="Efficient same sigma")
243
+ plt.scatter([same_mu_sigma * 100.0], [same_mu_mu * 100.0], label="Efficient same return")
244
+
245
+ if sugg_sigma is not None and sugg_mu is not None:
246
+ plt.scatter([sugg_sigma * 100.0], [sugg_mu * 100.0], label="Suggestion")
247
+
248
+ # simple guides
249
+ plt.plot(
250
+ [pt_sigma * 100.0, same_sigma_sigma * 100.0],
251
+ [pt_mu * 100.0, same_sigma_mu * 100.0],
252
+ linestyle="--", linewidth=1.1, alpha=0.7, color="gray",
253
+ )
254
+ plt.plot(
255
+ [pt_sigma * 100.0, same_mu_sigma * 100.0],
256
+ [pt_mu * 100.0, same_mu_mu * 100.0],
257
+ linestyle="--", linewidth=1.1, alpha=0.7, color="gray",
258
+ )
 
259
 
260
  plt.xlabel("σ (annualized)")
261
  plt.ylabel("Expected return (annual)")
262
+ plt.gca().xaxis.set_major_formatter(lambda v, pos: f"{v:.0f}%")
263
+ plt.gca().yaxis.set_major_formatter(lambda v, pos: f"{v:.0f}%")
264
+ plt.legend(loc="best", fontsize=8)
265
  plt.tight_layout()
266
 
267
  buf = io.BytesIO()
 
270
  buf.seek(0)
271
  return Image.open(buf)
272
 
273
+ # -------------- synthetic dataset --------------
274
+ def _row_to_exposures(row: pd.Series, universe: List[str]) -> Optional[np.ndarray]:
275
+ try:
276
+ ts = [t.strip().upper() for t in str(row["tickers"]).split(",") if t.strip()]
277
+ ws = [float(x) for x in str(row["weights"]).split(",")]
278
+ wmap = {t: ws[i] for i, t in enumerate(ts) if i < len(ws)}
279
+ w = np.array([wmap.get(t, 0.0) for t in universe], dtype=float)
280
+ gross = float(np.sum(np.abs(w)))
281
+ if gross <= 1e-12:
282
+ return None
283
+ return w / gross
284
+ except Exception:
285
+ return None
286
+
287
def build_synthetic_dataset(universe: List[str], years: int, rf_ann: float, erp_ann: float,
                            n_rows: int = 1000, seed: int = 12345) -> pd.DataFrame:
    """Generate random long/short portfolios over ``universe`` with their stats.

    Each row draws between 2 and 8 distinct symbols (fewer when the universe
    is smaller), assigns each a sign (short with probability 0.25), splits a
    gross exposure of ``1 + Gamma(2.0, 0.7)`` across them with Dirichlet
    weights, and records the result of ``portfolio_stats`` for that portfolio.

    Args:
        universe: Candidate symbols; falsy entries and duplicates are dropped.
        years: Lookback passed to ``estimate_all_moments_aligned``.
        rf_ann: Annual risk-free rate forwarded to the moment/stat helpers.
        erp_ann: Annual equity risk premium forwarded to ``portfolio_stats``.
        n_rows: Number of synthetic portfolios to generate (default 1000).
        seed: Seed for the NumPy generator, so the output is reproducible.

    Returns:
        DataFrame with columns ``id, tickers, weights, beta_p, er_p, sigma_p``.
        ``tickers`` and ``weights`` are comma-separated strings in matching
        order.  The frame is empty (but has these columns) when the universe
        is empty or ``n_rows`` is not positive.
    """
    columns = ["id", "tickers", "weights", "beta_p", "er_p", "sigma_p"]
    symbols = sorted({s for s in universe if s})
    if not symbols or n_rows <= 0:
        # Nothing to sample from; bail out before any data is fetched.
        return pd.DataFrame(columns=columns)

    moms = estimate_all_moments_aligned(symbols, years, rf_ann)
    covA, betas = moms["cov_ann"], moms["betas"]

    rng = np.random.default_rng(seed)
    # Bounds are loop-invariant; ``high`` is exclusive for rng.integers.
    k_low = min(2, len(symbols))
    k_high = min(8, len(symbols)) + 1

    rows = []
    for i in range(n_rows):
        # The order of the RNG draws below is part of the reproducible output.
        k = int(rng.integers(low=k_low, high=k_high))
        picks = [str(p) for p in rng.choice(symbols, size=k, replace=False)]
        signs = rng.choice([-1.0, 1.0], size=k, p=[0.25, 0.75])
        raw = rng.dirichlet(np.ones(k))
        gross = 1.0 + float(rng.gamma(2.0, 0.7))
        w = gross * signs * raw
        beta_p, er_p, sigma_p = portfolio_stats(dict(zip(picks, w)), covA, betas, rf_ann, erp_ann)
        rows.append({
            "id": i,
            "tickers": ",".join(picks),
            "weights": ",".join(f"{x:.6f}" for x in w),
            "beta_p": beta_p,
            "er_p": er_p,
            "sigma_p": sigma_p,
        })
    return pd.DataFrame(rows, columns=columns)
310
 
311
def save_synth_csv(df: pd.DataFrame, universe: List[str]) -> str:
    """Write ``df`` as CSV under ``DATA_DIR`` and return the file path.

    The file name embeds the first 8 hex digits of an MD5 digest computed
    from the sorted universe and the number of rows in ``df``.
    """
    key = ",".join(sorted(universe)) + f":{len(df)}"
    digest = hashlib.md5(key.encode()).hexdigest()
    out_path = os.path.join(DATA_DIR, f"investor_profiles_{digest[:8]}.csv")
    df.to_csv(out_path, index=False)
    return out_path
316
+
317
+ # -------------- suggestion logic (dataset only, optional embeddings) --------------
318
def describe_candidate_text(row: pd.Series, universe: List[str]) -> str:
    """Render a dataset row as a short text description for embedding.

    Lists up to eight of the largest absolute exposures (skipping those at or
    below 1e-4) followed by the row's beta, sigma and expected return.
    Returns an empty string when the row's exposures cannot be derived.
    """
    exposures = _row_to_exposures(row, universe)
    if exposures is None:
        return ""
    ranked = sorted(zip(universe, exposures), key=lambda pair: -abs(pair[1]))
    desc = " ".join(
        f"{ticker} {weight:+.2f}"
        for ticker, weight in ranked[:8]
        if abs(weight) > 1e-4
    )
    return f"weights {desc}; beta {row['beta_p']:.2f}; sigma {row['sigma_p']:.2f}; return {row['er_p']:.2f}"
328
+
329
# Loaded embedding models keyed by model name, so repeated calls do not
# re-instantiate (and potentially re-download) the model.
_EMBED_MODEL_CACHE: Dict[str, object] = {}


def _load_embedding_model(name: str = "FinLang/finance-embeddings-investopedia"):
    """Return the SentenceTransformer called ``name``, loading it on first use."""
    model = _EMBED_MODEL_CACHE.get(name)
    if model is None:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer(name)
        _EMBED_MODEL_CACHE[name] = model
    return model


def pick_by_risk_from_dataset(csv_path: str,
                              universe: List[str],
                              risk_label: str,
                              use_embeddings: bool) -> Optional[Dict]:
    """Choose one portfolio from the dataset CSV for the given risk level.

    Rows are shortlisted by ``sigma_p``: the 30 lowest for "low", the 30
    highest for "high", and the 30 closest to the median otherwise.  With
    ``use_embeddings`` the shortlist is re-ranked by cosine similarity between
    a risk prompt and each row's text description; if that fails for any
    reason, or embeddings are disabled, the first shortlisted row is used.

    Args:
        csv_path: Path to a CSV with ``tickers, weights, beta_p, er_p, sigma_p``.
        universe: Symbols onto which the chosen row's weights are mapped.
        risk_label: "low", "medium" or "high" (case-insensitive); anything
            else, including ``None``, is treated as medium.
        use_embeddings: Whether to re-rank the shortlist with embeddings.

    Returns:
        ``{"weights", "er", "sigma", "beta"}`` for the chosen row, or ``None``
        when the file is unreadable, has no usable rows, or the chosen row's
        weights cannot be parsed.
    """
    try:
        df = pd.read_csv(csv_path)
    except Exception:
        return None
    if df.empty or "sigma_p" not in df.columns:
        return None

    # Drop rows whose sigma is missing or non-numeric.  np.argsort places NaN
    # last, so the reversed "high" ordering would otherwise pick NaN rows first.
    sigma_all = pd.to_numeric(df["sigma_p"], errors="coerce").to_numpy(dtype=float)
    finite = np.isfinite(sigma_all)
    df = df.loc[finite]
    sigmas = sigma_all[finite]
    if df.empty:
        return None

    level = str(risk_label or "").strip().lower()
    ascending = np.argsort(sigmas)
    if level == "low":
        idxs = ascending[:30]
    elif level == "high":
        idxs = ascending[::-1][:30]
    else:
        med_value = float(np.median(sigmas))
        idxs = np.argsort(np.abs(sigmas - med_value))[:30]

    sub = df.iloc[idxs].copy()
    if sub.empty:
        return None

    chosen = sub.iloc[0]
    if use_embeddings:
        prompt_map = {
            "low": "low risk, stable, diversified, defensive, downside protection",
            "medium": "balanced risk, moderate volatility, diversified growth and income",
            "high": "high risk, aggressive growth, momentum, high volatility"
        }
        prompt = prompt_map.get(level, prompt_map["medium"])
        try:
            from sentence_transformers import util
            model = _load_embedding_model()
            cand_texts = [describe_candidate_text(r, universe) for _, r in sub.iterrows()]
            emb_prompt = model.encode([prompt], normalize_embeddings=True)
            emb_cands = model.encode(cand_texts, normalize_embeddings=True)
            sims = util.cos_sim(emb_prompt, emb_cands).cpu().numpy()[0]
            chosen = sub.iloc[int(np.argsort(-sims)[0])]
        except Exception:
            # Embeddings are an optional refinement (missing package, failed
            # download, ...): keep the sigma-ranked first candidate instead.
            chosen = sub.iloc[0]

    # NOTE(review): _row_to_exposures rescales weights to unit gross exposure,
    # while er/sigma/beta below are the stored row values, which
    # build_synthetic_dataset computes on the unscaled (gross >= 1) weights.
    # Confirm whether the reported stats should be rescaled to match.
    xs = _row_to_exposures(chosen, universe)
    if xs is None:
        return None
    wmap = {t: float(xs[i]) for i, t in enumerate(universe) if abs(xs[i]) > 1e-4}
    return {"weights": wmap,
            "er": float(chosen["er_p"]),
            "sigma": float(chosen["sigma_p"]),
            "beta": float(chosen["beta_p"])}
389
+
390
def build_simple_suggestion_table(weights_exposure: Dict[str, float],
                                  gross_capital: float,
                                  top_n: int = 12) -> pd.DataFrame:
    """Tabulate the ``top_n`` largest exposures as percent and dollar amounts.

    Rows are ordered by descending absolute weight.  ``weight_%`` is the
    weight times 100 rounded to two decimals; ``dollars_$`` is the weight
    times ``gross_capital`` rounded to whole dollars.  Signs are preserved.
    """
    columns = ["ticker", "weight_%", "dollars_$"]
    capital = float(gross_capital)
    ranked = sorted(weights_exposure.items(), key=lambda kv: -abs(kv[1]))
    records = [
        {
            "ticker": ticker,
            "weight_%": round(float(weight) * 100.0, 2),
            "dollars_$": round(float(weight) * capital, 0),
        }
        for ticker, weight in ranked[:top_n]
    ]
    return pd.DataFrame(records, columns=columns)
401
+
402
+ # -------------- summary builder --------------
403
def fmt_pct(x: float) -> str:
    """Format a fraction as a percentage with two decimals (0.1234 -> "12.34%")."""
    return "{:.2f}%".format(x * 100)
405
+
406
  def build_summary_md(lookback, horizon, rf, rf_code, erp, sigma_mkt,
407
  beta_p, er_p, sigma_p,
408
  a_sigma, b_sigma, mu_eff_sigma,
409
  a_mu, b_mu, sigma_eff_mu,
410
+ sugg=None, risk_label=None) -> str:
 
411
  lines = []
412
  lines.append("### Inputs")
413
+ lines.append(f"- Lookback years: **{lookback}**")
414
  lines.append(f"- Horizon years: **{int(round(horizon))}**")
415
+ lines.append(f"- Risk-free: **{fmt_pct(rf)}** (FRED {rf_code})")
416
  lines.append(f"- Market ERP: **{fmt_pct(erp)}**")
417
  lines.append(f"- Market σ: **{fmt_pct(sigma_mkt)}**")
418
  lines.append("")
 
422
  lines.append(f"- Expected return: **{fmt_pct(er_p)}**")
423
  lines.append("")
424
  lines.append("### Efficient alternatives on CML")
425
+ lines.append("Same σ as your portfolio")
426
+ lines.append(f"- Market weight **{a_sigma:.2f}**, Bills weight **{b_sigma:.2f}**")
427
+ lines.append(f"- Expected return **{fmt_pct(mu_eff_sigma)}**")
428
+ lines.append("Same μ as your portfolio")
429
+ lines.append(f"- Market weight **{a_mu:.2f}**, Bills weight **{b_mu:.2f}**")
430
+ lines.append(f"- σ **{fmt_pct(sigma_eff_mu)}**")
431
+ if sugg is not None:
432
+ lines.append("")
433
+ lines.append(f"### Dataset-based suggestion (risk: **{risk_label}**)")
434
+ lines.append(f"- Suggested β **{sugg['beta']:.2f}**, σ **{fmt_pct(sugg['sigma'])}**, μ **{fmt_pct(sugg['er'])}**")
435
  return "\n".join(lines)
436
 
437
+ # -------------- global state --------------
438
# Module-level session state.  UNIVERSE is the starting symbol list and is
# reassigned by compute(); the risk-free series and rate are resolved once
# at import time from the starting horizon.
UNIVERSE = [
    MARKET_TICKER,
    "QQQ", "XLK", "XLP", "XLE", "VNQ", "IEF", "HYG", "GLD", "EEM",
]
HORIZON_YEARS = 10
RF_CODE = fred_series_for_horizon(HORIZON_YEARS)
RF_ANN = fetch_fred_yield_annual(RF_CODE)
442
+
443
+ # -------------- gradio callbacks --------------
444
  def search_tickers_cb(q: str):
445
  hits = yahoo_search(q)
446
  if not hits:
 
487
  HORIZON_YEARS = y
488
  RF_CODE = code
489
  RF_ANN = rf
490
+ return f"Risk-free series {code}. Latest annual rate {rf:.2%}. Will be used on compute."
491
+
492
def _empty_compute_result(summary: str, universe_status: str):
    """Build the six-output tuple ``compute`` returns when it cannot proceed."""
    return (
        None,
        summary,
        universe_status,
        pd.DataFrame(columns=POS_COLS),
        pd.DataFrame(columns=["ticker", "weight_%", "dollars_$"]),
        None,
    )


def compute(lookback: int,
            table: pd.DataFrame,
            risk_label: str,
            use_embeddings: bool):
    """Gradio callback: analyse the entered positions and suggest a portfolio.

    Validates the tickers in ``table``, replaces the global ``UNIVERSE`` with
    them, computes the portfolio's beta / expected return / sigma, the two
    efficient alternatives on the CML, and a dataset-based suggestion for
    ``risk_label``.

    Args:
        lookback: Lookback in years used for validation and moment estimates.
        table: Frame with ``ticker`` and ``amount_usd`` columns.
        risk_label: Risk level forwarded to ``pick_by_risk_from_dataset``.
        use_embeddings: Whether the suggestion is re-ranked with embeddings.

    Returns:
        ``(plot image, summary markdown, universe message, positions table,
        suggestion table, dataset CSV path)``.  On invalid input the image
        and path are ``None`` and both tables are empty.
    """
    global UNIVERSE

    if table is None or len(table) == 0:
        return _empty_compute_result("Add at least one ticker", "Universe empty")

    df = table.dropna().copy()
    df["ticker"] = df["ticker"].astype(str).str.upper().str.strip()
    df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce").fillna(0.0)

    symbols = [t for t in df["ticker"].tolist() if t]
    symbols = validate_tickers(symbols, lookback)
    if len(symbols) == 0:
        return _empty_compute_result("Could not validate any tickers", "Universe invalid")

    UNIVERSE = sorted(set(symbols))[:MAX_TICKERS]
    uni_msg = f"Universe set to: {', '.join(UNIVERSE)}"

    # Amounts per ticker.  A ticker entered on several rows is summed rather
    # than letting the last row silently overwrite the earlier ones.
    in_universe = set(UNIVERSE)
    amounts: Dict[str, float] = {}
    for ticker, amount in zip(df["ticker"], df["amount_usd"]):
        if ticker in in_universe:
            amounts[ticker] = amounts.get(ticker, 0.0) + float(amount)

    # Gross = sum of absolute exposures; checked before the moment estimates
    # so a zero portfolio returns without doing that work.
    gross = float(sum(abs(v) for v in amounts.values()))
    if gross <= 1e-12:
        return _empty_compute_result("All amounts are zero", uni_msg)

    rf_ann = RF_ANN

    # Aligned moments for the whole universe.
    moms = estimate_all_moments_aligned(UNIVERSE, lookback, rf_ann)
    betas, covA, erp_ann, sigma_mkt = moms["betas"], moms["cov_ann"], moms["erp_ann"], moms["sigma_m_ann"]

    weights = {k: v / gross for k, v in amounts.items()}
    beta_p, er_p, sigma_p = portfolio_stats(weights, covA, betas, rf_ann, erp_ann)

    a_sigma, b_sigma, mu_eff_sigma = efficient_same_sigma(sigma_p, rf_ann, erp_ann, sigma_mkt)
    a_mu, b_mu, sigma_eff_mu = efficient_same_return(er_p, rf_ann, erp_ann, sigma_mkt)

    # Build (or reuse) the synthetic dataset for this universe.  The file is
    # written to the same path that is checked for reuse; writing it under a
    # differently-derived name would mean the cache is never hit.
    sig = hashlib.md5((",".join(sorted(UNIVERSE)) + f":{lookback}:{RF_CODE}").encode()).hexdigest()[:8]
    csv_path = os.path.join(DATA_DIR, f"investor_profiles_{sig}.csv")
    if not os.path.exists(csv_path):
        synth_df = build_synthetic_dataset(UNIVERSE, years=lookback, rf_ann=rf_ann, erp_ann=erp_ann)
        os.makedirs(DATA_DIR, exist_ok=True)
        synth_df.to_csv(csv_path, index=False)

    # Dataset-based suggestion by risk.
    sug = pick_by_risk_from_dataset(csv_path, UNIVERSE, risk_label=risk_label, use_embeddings=use_embeddings)
    suggestion_df = pd.DataFrame(columns=["ticker", "weight_%", "dollars_$"])
    sugg_sigma_plot = None
    sugg_mu_plot = None
    if sug is not None:
        suggestion_df = build_simple_suggestion_table(sug["weights"], gross_capital=gross)
        sugg_sigma_plot = sug["sigma"]
        sugg_mu_plot = sug["er"]

    # Positions table (computed from the user's inputs).
    rows = []
    for t in UNIVERSE:
        if t in amounts:
            beta_val = 1.0 if t == moms["mkt"] else betas.get(t, np.nan)
            rows.append({
                "ticker": t,
                "amount_usd": float(amounts.get(t, 0.0)),
                "weight_exposure": float(weights.get(t, 0.0)),
                "beta": float(beta_val),
            })
    pos_table = pd.DataFrame(rows, columns=POS_COLS)

    # Plot & summary.
    img = plot_cml(
        rf_ann, erp_ann, sigma_mkt,
        sigma_p, er_p,
        sigma_p, mu_eff_sigma,
        sigma_eff_mu, er_p,
        sugg_sigma=sugg_sigma_plot, sugg_mu=sugg_mu_plot
    )

    info = build_summary_md(
        lookback, HORIZON_YEARS, rf_ann, RF_CODE, erp_ann, sigma_mkt,
        beta_p, er_p, sigma_p,
        a_sigma, b_sigma, mu_eff_sigma,
        a_mu, b_mu, sigma_eff_mu,
        sugg=sug, risk_label=risk_label
    )

    return img, info, uni_msg, pos_table, suggestion_df, csv_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
583
 
584
+ # -------------- UI --------------
585
  with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
586
  gr.Markdown(
587
  "## Efficient Portfolio Advisor\n"
588
+ "Search symbols, enter dollar amounts, set your horizon. Prices from Yahoo Finance. Risk-free from FRED. "
589
+ "Low/Medium/High suggestions are chosen only from a 1,000-row dataset generated from your current universe, "
590
+ "optionally refined with finance embeddings."
 
591
  )
592
 
593
  with gr.Row():
 
604
  headers=["ticker", "amount_usd"],
605
  datatype=["str", "number"],
606
  row_count=0,
607
+ col_count=(2, "fixed"),
608
+ wrap=True,
609
  )
610
 
611
+ horizon = gr.Number(label="Horizon in years (1–100)", value=HORIZON_YEARS, precision=0)
612
  lookback = gr.Slider(1, 10, value=DEFAULT_LOOKBACK_YEARS, step=1, label="Lookback years for beta & sigma")
613
 
614
  gr.Markdown("### Suggestions")
615
+ risk = gr.Radio(choices=["Low", "Medium", "High"], value="Medium", label="Risk tolerance")
616
+ use_emb = gr.Checkbox(label="Use finance embeddings to refine picks", value=True)
617
 
618
  run_btn = gr.Button("Compute (build dataset & suggest)")
619
 
620
  with gr.Column(scale=1):
621
  plot = gr.Image(label="Capital Market Line (CML)", type="pil")
622
+ summary = gr.Markdown(label="Inputs & Results")
623
  universe_msg = gr.Textbox(label="Universe status", interactive=False)
624
+
625
  positions = gr.Dataframe(
626
  label="Computed positions",
627
  headers=POS_COLS,
 
630
  value=pd.DataFrame(columns=POS_COLS),
631
  interactive=False
632
  )
633
+
634
  suggestions = gr.Dataframe(
635
+ label="Suggested holdings (weights are % of gross capital; negatives = shorts)",
636
+ headers=["ticker", "weight_%", "dollars_$"],
637
+ datatype=["str", "number", "number"],
638
+ col_count=(3, "fixed"),
639
+ value=pd.DataFrame(columns=["ticker","weight_%","dollars_$"]),
640
  interactive=False
641
  )
642
+
643
  dl = gr.File(label="Generated dataset CSV", value=None, visible=True)
644
 
 
645
  def do_search(query):
646
  note, options = search_tickers_cb(query)
647
  return note, gr.update(choices=options)
 
652
  horizon.change(fn=set_horizon, inputs=horizon, outputs=universe_msg)
653
 
654
  run_btn.click(
655
+ fn=compute,
656
+ inputs=[lookback, table, risk, use_emb],
657
  outputs=[plot, summary, universe_msg, positions, suggestions, dl]
658
  )
659
 
660
if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script.
    demo.launch()