ISOM5240-Final-Assignment

Sleeping

App Files Files Community

ISOM5240-Final-Assignment / app.py

hskwon7

Update app.py

e5c69a4 verified 9 months ago

raw

history blame contribute delete

15.9 kB

	import os
	import streamlit as st
	import uuid
	import pandas as pd
	import modules
	import torch
	from sentence_transformers import SentenceTransformer
	import faiss
	from transformers import AutoTokenizer, AutoModelForTokenClassification
	import re

	# ─── CACHES ─────────────────────────────────────────────────────────────────

	@st.cache_data(show_spinner=False)
	def load_etf_data():
	    """Read the ETF CSV files and assemble the tables the app works with.

	    The general-info table comes from the enriched cache file when that file
	    exists. Otherwise it is built from the raw file: a ``doc`` column is
	    produced row by row with ``modules.make_doc_text`` and the result is
	    written to the cache path so later runs can read it directly. Holdings
	    are left-joined onto the info table by ``Ticker`` before it is handed to
	    ``modules.set_etf_data``.

	    Returns:
	        A 4-tuple ``(df_etf, df_analyst_report, available_tickers,
	        df_annual_return_master)``.
	    """
	    ticker_rename = {"ticker": "Ticker"}
	    enriched_path = "etf_general_info_enriched_doc_added.csv"
	    raw_path = "etf_general_info_enriched.csv"

	    # Pick the source once; the doc column only has to be built for the raw file.
	    doc_cache_missing = not os.path.exists(enriched_path)
	    source_path = raw_path if doc_cache_missing else enriched_path
	    df_info = pd.read_csv(source_path).rename(columns=ticker_rename)
	    if doc_cache_missing:
	        df_info["doc"] = df_info.apply(modules.make_doc_text, axis=1)
	        df_info.to_csv(enriched_path, index=False)

	    holdings = pd.read_csv('etf_holdings_summarized.csv')
	    holdings = holdings.rename(
	        columns={'ticker': 'Ticker', 'holdingInformation': 'Holdings'}
	    )
	    df_info = df_info.merge(holdings, how='left', on='Ticker')
	    df_etf, available_tickers = modules.set_etf_data(df_info)

	    df_analyst_report = pd.read_csv("etf_analyst_report_full.csv").rename(columns=ticker_rename)
	    df_annual_return_master = pd.read_csv("annual_return.csv").rename(columns=ticker_rename)
	    return df_etf, df_analyst_report, available_tickers, df_annual_return_master

	@st.cache_resource(show_spinner=False)
	def build_search_resources():
	    """Load the sentence-embedding model and the FAISS index over ETF docs.

	    The index is read from ``etf_faiss.index`` when that file exists and its
	    vector count matches the current ETF table. Otherwise the ``doc`` column
	    is embedded, L2-normalised, added to an inner-product index and the
	    index is written to disk.

	    Returns:
	        ``(model, index, ticker_list)`` where ``ticker_list[i]`` is the ticker
	        for index row ``i``.
	    """
	    df_etf, *_ = load_etf_data()
	    model = SentenceTransformer(
	        "hskwon7/paraphrase-MiniLM-L6-v2-ft-for-etf-semantic-search"
	    )
	    ticker_list = df_etf["Ticker"].tolist()

	    idx_path = "etf_faiss.index"
	    index = faiss.read_index(idx_path) if os.path.exists(idx_path) else None

	    # Search results are translated to tickers purely by row position, so an
	    # index persisted for a different number of ETFs would return wrong
	    # tickers (or out-of-range ids). Rebuild it in that case.
	    if index is None or index.ntotal != len(ticker_list):
	        embs = model.encode(df_etf["doc"].tolist(), convert_to_numpy=True)
	        faiss.normalize_L2(embs)  # in place; makes inner product a cosine score
	        index = faiss.IndexFlatIP(embs.shape[1])
	        index.add(embs)
	        faiss.write_index(index, idx_path)

	    return model, index, ticker_list

	@st.cache_resource(show_spinner=False)
	def load_ner_models():
	    """Load both ticker-tagging models and the set of known tickers.

	    Returns:
	        ``((tok1, m1), (tok2, m2), valid_ticker_set)``: a tokenizer/model pair
	        for each of the two token-classification checkpoints, followed by
	        the upper-cased tickers found in the ETF table.
	    """
	    checkpoints = (
	        "hskwon7/distilbert-base-uncased-for-etf-ticker",
	        "hskwon7/albert-base-v2-for-etf-ticker",
	    )
	    pairs = []
	    for checkpoint in checkpoints:
	        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	        tagger = AutoModelForTokenClassification.from_pretrained(checkpoint)
	        pairs.append((tokenizer, tagger))

	    etf_table, *_ = load_etf_data()
	    valid_ticker_set = set(etf_table["Ticker"].str.upper())
	    return pairs[0], pairs[1], valid_ticker_set

	# ─── INITIALIZE ─────────────────────────────────────────────────────────────

	# Module-level state shared by the routines below. All three loaders are
	# wrapped in Streamlit cache decorators above.
	# ETF table, analyst reports, available tickers and annual-return table.
	df_etf, df_analyst_report, available_tickers, df_annual_return_master = load_etf_data()
	# Embedding model, FAISS index and the ticker list used to label index rows.
	s2_model, faiss_index, etf_list = build_search_resources()
	# Two (tokenizer, model) pairs for ticker tagging plus the set of known tickers.
	(tok1, m1), (tok2, m2), valid_ticker_set = load_ner_models()

	# ─── CORE ROUTINES ──────────────────────────────────────────────────────────

	# Semantic Search
	def semantic_search(q: str, top_k: int=500):
	    """Return the tickers of the ETFs closest to a free-text query.

	    The query is embedded with ``s2_model``, L2-normalised and searched
	    against ``faiss_index``; the returned row ids are mapped to tickers
	    through ``etf_list``.

	    Args:
	        q: The user's query text.
	        top_k: Maximum number of neighbours to request from the index.

	    Returns:
	        A list of tickers in the order returned by the index, containing at
	        most ``top_k`` entries.
	    """
	    emb = s2_model.encode([q], convert_to_numpy=True)
	    faiss.normalize_L2(emb)
	    _, neighbour_ids = faiss_index.search(emb, top_k)

	    # FAISS fills the id array with -1 when it has fewer than top_k vectors.
	    # A negative id must not reach list indexing, where it would silently
	    # resolve to the last ticker; ids beyond the list are dropped as well.
	    n_tickers = len(etf_list)
	    return [etf_list[i] for i in neighbour_ids[0] if 0 <= i < n_tickers]

	# Ensemble function: union of both models' predictions
	def ensemble_ticker_extraction(query):
	    """Collect the tickers that either tagging model finds in ``query``.

	    Each (tokenizer, model) pair labels the query's tokens; the tokens and
	    labels are passed to ``modules.extract_valid_tickers`` together with
	    ``valid_ticker_set``, and the results from both models are unioned.

	    Args:
	        query: The user's query text.

	    Returns:
	        A set with the combined results of both models.
	    """
	    preds = set()

	    for tok, mdl in ((tok1, m1), (tok2, m2)):
	        # truncation=True keeps an over-long chat message within the
	        # tokenizer's maximum length instead of failing inside the model.
	        enc = tok(query, return_tensors="pt", truncation=True)
	        with torch.no_grad():
	            logits = mdl(**enc).logits
	        # Highest-scoring label id per token for the single input sequence.
	        pred_ids = logits.argmax(dim=-1)[0].tolist()
	        tokens = tok.convert_ids_to_tokens(enc["input_ids"][0])
	        labels = [mdl.config.id2label[i] for i in pred_ids]
	        preds.update(modules.extract_valid_tickers(tokens, labels, tok, valid_ticker_set))

	    return preds

	# Rule-based fallback: catch literal 2–4 char tickers in the text
	def rule_fallback(query, valid_set):
	    """Find known tickers written literally in ``query``.

	    Every standalone run of 2-4 letters or digits is upper-cased and kept
	    when it is a member of ``valid_set``.

	    Args:
	        query: The user's query text.
	        valid_set: Container of upper-case tickers to match against.

	    Returns:
	        A set of the upper-cased matches.
	    """
	    matched = set()
	    for candidate in re.findall(r"\b[A-Za-z0-9]{2,4}\b", query):
	        symbol = candidate.upper()
	        if symbol in valid_set:
	            matched.add(symbol)
	    return matched

	# ─── UI HELPERS ─────────────────────────────────────────────────────────────

	def display_sample_query_boxes(key_prefix=""):
	    """Render one card per app feature with sample queries and a jump button.

	    Each card shows a title, a description and example queries. Pressing the
	    button under a card stores the matching page name in
	    ``st.session_state["page"]`` and reruns the script.

	    Args:
	        key_prefix: Text prepended to each button's widget key.
	    """
	    sample_queries = {
	        "search_etf": {
	            "title": "ETF Search",
	            "description": "Explore ETFs based on criteria such as high dividends, low expense ratios, or sector focus.",
	            "query": [
	                'High-dividend ETFs in the tech sector.',
	                'Precious metals ETFs with low expense ratio.',
	                'Large growth ETFs with high returns.',
	            ],
	        },
	        "comparison": {
	            "title": "ETF Performance Comparison",
	            "description": "Compare two ETFs side by side to evaluate their performance, risk, and other metrics.",
	            "query": [
	                "I'd like to compare performance of QQQ with GLD.",
	                "Compare SPY and VOO.",
	                "SCHD vs. VTI",
	            ],
	        },
	        "portfolio_projection": {
	            "title": "Portfolio Projection",
	            "description": "Project a portfolio with your choice of ETFs over 30 years.",
	            "query": [
	                "I want to invest in SPY, QQQ, SCHD, and IAU.",
	                "Portfolio projection for VTI, XLF, and XLY.",
	            ],
	        },
	    }
	    # Page name stored in session state for each card's button.
	    page_map = {
	        "search_etf": "ETF Search",
	        "comparison": "ETF Comparison",
	        "portfolio_projection": "ETF Portfolio",
	    }

	    cols = st.columns(len(sample_queries))
	    title_h = "30px"
	    desc_h = "60px"
	    query_h = "70px"

	    for column, (key, details) in zip(cols, sample_queries.items()):
	        quoted_queries = '<br>'.join(f'“{q}”' for q in details['query'])
	        card_html = f"""
	<div style="
	width:100%; height:350; border:1px solid #ddd;
	border-radius:10px; padding:15px; margin:auto;
	display:flex; flex-direction:column; justify-content:space-between;
	box-shadow:2px 2px 8px rgba(0,0,0,0.1);
	">
	<div style="height:{title_h}; text-align:center;">
	<b style="font-size:16px; color:#2c3e50;">
	{details['title']}
	</b>
	</div>
	<div style="height:{desc_h}; text-align:center; color:#7f8c8d; font-size:14px; overflow:auto;">
	{details['description']}
	</div>
	<div style="height:{query_h}; text-align:center; color:#34495e; font-size:13px; font-style:italic; overflow:auto;">
	{quoted_queries}
	</div>
	</div>
	"""
	        with column:
	            st.markdown(card_html, unsafe_allow_html=True)

	            # Markup emitted around the button, intended to centre it under the card.
	            st.markdown("<div style='text-align:center; margin-top:10px;'>", unsafe_allow_html=True)
	            if st.button("Go to this app", key=key_prefix + key):
	                st.session_state["page"] = page_map[key]
	                st.rerun()
	            st.markdown("</div>", unsafe_allow_html=True)

	def display_chat_history(task: str):
	    """Replay the stored conversation for ``task``.

	    Reads ``st.session_state["all_chat_history_<task>"]`` (nothing is shown
	    when the key is absent) and, for each entry, renders whichever of the
	    user query, figure, ETF table and assistant response it carries, in
	    that order.

	    Args:
	        task: Task identifier used in the session-state key.
	    """
	    history = st.session_state.get(f"all_chat_history_{task}", [])
	    for record in history:
	        user_text = record.get("query")
	        if user_text:
	            st.chat_message("user").write(user_text)

	        figure = record.get("fig")
	        if figure:
	            st.plotly_chart(figure, use_container_width=True)

	        table = record.get("df")
	        if table is not None:
	            modules.display_matching_etfs(table)

	        reply = record.get("response")
	        if reply:
	            st.chat_message("assistant").write(reply)

	def _record_user_query(task, query):
	    """Echo the user's message and append it to the task's chat history."""
	    st.chat_message("user").write(query)
	    st.session_state[f"all_chat_history_{task}"].append(
	        modules.form_d_chat_history(str(uuid.uuid4()), None, task, df=None, query=query)
	    )


	def _record_assistant_reply(task, response, **artifacts):
	    """Show the assistant's reply and append it, with any fig/df, to the history."""
	    st.chat_message("assistant").write(response)
	    st.session_state[f"all_chat_history_{task}"].append(
	        modules.form_d_chat_history(str(uuid.uuid4()), response, task, **artifacts)
	    )


	def _extract_tickers(query):
	    """Return the sorted union of model-tagged and literally written tickers."""
	    ensemble_preds = ensemble_ticker_extraction(query)
	    fallback_preds = rule_fallback(query, valid_ticker_set)
	    return sorted(ensemble_preds | fallback_preds)


	def _answer_search(task, query):
	    """Run a semantic ETF search and show the matching ETFs."""
	    # Number of ETFs to fetch from the index and to display.
	    top_k, top_n = 50, 20

	    with st.spinner("Hang on tight! Searching ETFs..."):
	        fetched = semantic_search(query, top_k)
	        df_out = modules.get_etf_recommendations_from_list(fetched, df_etf, top_n)
	        relevant_tickers = df_out['Ticker'].tolist()
	        response = modules.format_etf_search_results_inline(relevant_tickers)

	    st.markdown("### ETF Search Results")
	    modules.display_matching_etfs(df_out)
	    _record_assistant_reply(task, response, df=df_out)


	def _answer_comparison(task, query):
	    """Compare the two ETFs named in the query, or ask for exactly two."""
	    with st.spinner("Hang on tight! Running comparison analysis..."):
	        tk = _extract_tickers(query)
	        pair_given = len(tk) == 2
	        if pair_given:
	            df_out = modules.get_etf_recommendations_from_list(tk, df_etf, top_n=2)
	            fig = modules.compare_etfs_interactive(tk[0], tk[1])
	            d_analyst_reports = modules.lookup_etf_report(tk, df_analyst_report=df_analyst_report)
	            response = modules.format_insights_report(d_analyst_reports)
	        else:
	            response, fig, df_out = "Please specify exactly two tickers.", None, None

	    # The chart and table exist only when a valid pair was found.
	    if pair_given:
	        st.markdown("### Performance Comparison")
	        st.plotly_chart(fig, use_container_width=True)
	        modules.display_matching_etfs(df_out)

	    _record_assistant_reply(task, response, fig=fig, df=df_out)


	def _answer_portfolio(task, query):
	    """Project a portfolio built from the ETFs named in the query."""
	    with st.spinner("Hang on tight! Projecting portfolio ..."):
	        tk = _extract_tickers(query)
	        # NOTE(review): an empty ticker list is forwarded unchanged; confirm
	        # that modules.run_portfolio_analysis copes with it.
	        df_port_output, d_summary = modules.run_portfolio_analysis(tk, df_etf, df_annual_return_master)
	        response = modules.format_portfolio_summary(d_summary=d_summary)
	        fig = modules.portfolio_interactive_chart(df_port_output)

	    st.markdown("### 30 Years Investment Return Projection")
	    st.plotly_chart(fig, use_container_width=True)
	    _record_assistant_reply(task, response, fig=fig)


	def process_query(task: str, query: str):
	    """Handle one chat message for the given task.

	    The user's message is shown and stored, the task-specific analysis runs
	    behind a spinner, its output is rendered, and the assistant's reply is
	    stored in ``st.session_state["all_chat_history_<task>"]``.

	    Args:
	        task: One of ``"search_etf"``, ``"comparison"`` or
	            ``"portfolio_projection"``. Any other value is ignored.
	        query: The text the user typed.
	    """
	    handlers = {
	        "search_etf": _answer_search,
	        "comparison": _answer_comparison,
	        "portfolio_projection": _answer_portfolio,
	    }
	    handler = handlers.get(task)
	    if handler is None:
	        # Unknown tasks produce no output and touch no history.
	        return

	    _record_user_query(task, query)
	    handler(task, query)

	# ─── MAIN ────────────────────────────────────────────────────────────────

	def main():
	    """Build the page: session defaults, sidebar navigation and page body.

	    The current page name lives in ``st.session_state["page"]``. The Home
	    page shows introductory text and the sample-query cards; every other
	    page shows its description, the stored chat history and a chat input
	    whose submitted text is passed to ``process_query``.
	    """
	    st.set_page_config(layout="wide")

	    # Session defaults
	    st.session_state.setdefault("page", "Home")
	    for task_name in ("search_etf", "comparison", "portfolio_projection"):
	        st.session_state.setdefault(f"all_chat_history_{task_name}", [])

	    # Sidebar navigation: (button label, page name)
	    st.sidebar.title("ETF Assistant")
	    nav_buttons = (
	        ("🏠 Home", "Home"),
	        ("🔎 ETF Search", "ETF Search"),
	        ("⚖️ ETF Comparison", "ETF Comparison"),
	        ("💼 ETF Portfolio", "ETF Portfolio"),
	    )
	    for label, target in nav_buttons:
	        if st.sidebar.button(label):
	            st.session_state["page"] = target

	    page = st.session_state["page"]
	    st.title("ETF Assistant" if page == "Home" else page)

	    if page == "Home":
	        st.header("How can I assist you today?")

	        etf_intro_text = (
	            "An exchange-traded fund (ETF) is an investment vehicle that holds a diversified basket of assets—such as stocks, bonds,"
	            " or commodities—and trades on an exchange like a single stock. ETFs combine the diversification and low costs of mutual funds "
	            "with the flexibility and intraday liquidity of individual equities."
	        )
	        st.write(etf_intro_text)

	        app_intro_text = "Find ETFs that align with your investment goals and sector interests, compare performance, and estimate your portfolio—all in one place!"
	        st.write(app_intro_text)
	        display_sample_query_boxes(key_prefix="home_")
	        return

	    # Per-page settings: (task id, description, chat-input placeholder)
	    page_settings = {
	        "ETF Search": (
	            "search_etf",
	            "Explore ETFs based on criteria such as high dividends, low expense ratios, or sector focus.",
	            "Search for ETFs…",
	        ),
	        "ETF Comparison": (
	            "comparison",
	            "Compare two ETFs side by side to evaluate their performance, risk, and other metrics.",
	            "Compare ETFs…",
	        ),
	        "ETF Portfolio": (
	            "portfolio_projection",
	            "Project a portfolio with your choice of ETFs over 30 years.",
	            "Project portfolio…",
	        ),
	    }
	    task, app_description_text, placeholder = page_settings[page]
	    st.write(app_description_text)

	    # Earlier turns first, then the input box for the next one.
	    display_chat_history(task)
	    q = st.chat_input(placeholder, key=task)
	    if q:
	        process_query(task, q)

	# Entry point: build the UI when this file is executed as a script.
	if __name__=="__main__":
	    main()