Update app.py

app.py CHANGED
@@ -1,64 +1,120 @@
  import streamlit as st
  from sentence_transformers import SentenceTransformer
  import faiss
  from transformers import AutoTokenizer, AutoModelForTokenClassification
- import modules
- import torch

- # ─── …
  @st.cache_resource(show_spinner=False)
- def …
-     …
      repo_name = "hskwon7/paraphrase-MiniLM-L6-v2-ft-for-etf-semantic-search"
-     model …

-     # …
-     df_etf, *_ = modules.load_etf_data()
-     df_etf["doc"] = df_etf.apply(modules.make_doc_text, axis=1)
      ticker_list = df_etf["Ticker"].tolist()
-     embs = model.encode(df_etf["doc"].tolist(), convert_to_numpy=True)  # no progress bar

-     # …
-     …
-     …
-     …

      return model, index, ticker_list

  @st.cache_resource(show_spinner=False)
  def load_ner_models():
-     …
-     …
-     …
-     …
-     …
      tok1 = AutoTokenizer.from_pretrained(repo1)
      m1 = AutoModelForTokenClassification.from_pretrained(repo1)
      tok2 = AutoTokenizer.from_pretrained(repo2)
      m2 = AutoModelForTokenClassification.from_pretrained(repo2)

-     …
-     …
-     valid = set(t.upper() for t in df_etf["Ticker"].unique())

-     return (tok1, m1), (tok2, m2), …

- # ─── …

- # …
- …
  (tok1, m1), (tok2, m2), valid_ticker_set = load_ner_models()

- …
      q_emb = s2_model.encode([query], convert_to_numpy=True)
      faiss.normalize_L2(q_emb)
      D, I = faiss_index.search(q_emb, top_k)
      return [(etf_list[idx], float(D[0][i])) for i, idx in enumerate(I[0])]

- …
      preds = set()
      for tok, mdl in ((tok1, m1), (tok2, m2)):
-         enc …
          with torch.no_grad():
              logits = mdl(**enc).logits
          ids = logits.argmax(dim=-1)[0].tolist()
+ import os
  import streamlit as st
+ import uuid
+ import pandas as pd
+ import modules
+ import torch
  from sentence_transformers import SentenceTransformer
  import faiss
  from transformers import AutoTokenizer, AutoModelForTokenClassification

+ # ─── CACHES ──────────────────────────────────────────────────────────────
+
+ @st.cache_data(show_spinner=False)
+ def load_etf_data():
+     """
+     Load ETF data with a persistent 'doc' column.
+
+     - On first run: reads raw CSV, computes 'doc', saves enriched CSV.
+     - On subsequent runs: loads enriched CSV directly.
+     """
+     enriched_path = "etf_general_info_enriched_doc_added.csv"
+     raw_path = "etf_general_info_enriched.csv"
+
+     if os.path.exists(enriched_path):
+         df_info = pd.read_csv(enriched_path)
+     else:
+         df_info = pd.read_csv(raw_path).rename(columns={"ticker": "Ticker"})
+         df_info["doc"] = df_info.apply(modules.make_doc_text, axis=1)
+         df_info.to_csv(enriched_path, index=False)
+
+     # Split into DataFrame and ticker list
+     df_etf, available_tickers = modules.set_etf_data(df_info)
+
+     # Load other supporting DataFrames
+     df_analyst_report = pd.read_csv("etf_analyst_report_full.csv")
+     df_annual_return_master = (
+         pd.read_csv("annual_return.csv").rename(columns={"ticker": "Ticker"})
+     )
+     return df_etf, df_analyst_report, available_tickers, df_annual_return_master
+
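`load_etf_data` leans on two helpers from the Space's `modules.py`, which this diff does not touch. For orientation only, here is a hypothetical sketch of what a `make_doc_text` helper of this kind typically does; the column names are invented, not taken from the Space:

    def make_doc_text(row) -> str:
        # Hypothetical sketch: the real implementation and column names
        # live in modules.py and are not shown in this diff.
        parts = [row.get("Ticker", ""), row.get("Name", ""),
                 row.get("Category", ""), row.get("Description", "")]
        return " | ".join(str(p) for p in parts if p)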
  @st.cache_resource(show_spinner=False)
+ def build_search_resources():
+     """
+     Loads (or builds) SentenceTransformer + FAISS index + ticker list.
+
+     - On first run: computes embeddings, builds index, writes to disk.
+     - On subsequent runs: loads FAISS index from disk.
+     """
+     # 1) Ensure ETF data (with 'doc') is loaded
+     df_etf, _, _, _ = load_etf_data()
+
+     # 2) Load SentenceTransformer model
      repo_name = "hskwon7/paraphrase-MiniLM-L6-v2-ft-for-etf-semantic-search"
+     model = SentenceTransformer(repo_name)

+     # 3) Prepare list of tickers
      ticker_list = df_etf["Ticker"].tolist()

+     # 4) Persist / load FAISS index
+     index_path = "etf_faiss.index"
+     if os.path.exists(index_path):
+         index = faiss.read_index(index_path)
+     else:
+         docs = df_etf["doc"].tolist()
+         embs = model.encode(docs, convert_to_numpy=True)
+         faiss.normalize_L2(embs)
+         index = faiss.IndexFlatIP(embs.shape[1])
+         index.add(embs)
+         faiss.write_index(index, index_path)

      return model, index, ticker_list

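The normalize-then-`IndexFlatIP` pattern above makes inner-product search return cosine similarities. A minimal, self-contained demonstration (not code from this Space):

    import faiss
    import numpy as np

    vecs = np.random.rand(4, 8).astype("float32")
    faiss.normalize_L2(vecs)                  # in place: every row now has unit norm
    index = faiss.IndexFlatIP(vecs.shape[1])  # exact inner-product search
    index.add(vecs)

    D, I = index.search(vecs[:1], 2)          # D[0][0] is ~1.0: the query matches itself

One consequence of the persistence step: `faiss.read_index` bypasses encoding entirely, so if the enriched CSV changes, the stale `etf_faiss.index` file has to be deleted by hand before the index gets rebuilt.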
  @st.cache_resource(show_spinner=False)
  def load_ner_models():
+     """
+     Loads two NER models for ticker extraction, and builds the valid ticker set.
+     """
+     repo1 = "hskwon7/distilbert-base-uncased-for-etf-ticker"
+     repo2 = "hskwon7/albert-base-v2-for-etf-ticker"
+
      tok1 = AutoTokenizer.from_pretrained(repo1)
      m1 = AutoModelForTokenClassification.from_pretrained(repo1)
      tok2 = AutoTokenizer.from_pretrained(repo2)
      m2 = AutoModelForTokenClassification.from_pretrained(repo2)

+     df_etf, *_ = load_etf_data()
+     valid_ticker_set = set(t.upper() for t in df_etf["Ticker"].unique())

+     return (tok1, m1), (tok2, m2), valid_ticker_set

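For a simple union-style ensemble, both token-classification heads need a compatible label scheme. The model configs are not shown in this diff, but under that assumption a quick sanity check would be:

    # Assumption: both fine-tuned heads expose the same labels via config,
    # e.g. something BIO-like such as {0: "O", 1: "B-TICKER", 2: "I-TICKER"}.
    assert set(m1.config.id2label.values()) == set(m2.config.id2label.values())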
+ # ─── INITIALIZE CACHED RESOURCES ─────────────────────────────────────────

+ # Load data & models once per session
+
+ df_etf, df_analyst_report, available_tickers, df_annual_return_master = load_etf_data()
+ s2_model, faiss_index, etf_list = build_search_resources()
  (tok1, m1), (tok2, m2), valid_ticker_set = load_ner_models()

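The cache split introduced in this commit follows Streamlit's intended usage: `st.cache_data` for picklable values such as DataFrames, `st.cache_resource` for live handles such as the SentenceTransformer, the FAISS index, and the NER models. Strictly speaking, both caches are shared by all sessions of one Streamlit process, not held per session. A minimal illustration of the split:

    import streamlit as st
    import pandas as pd

    @st.cache_data            # caches a serialized copy; callers get a fresh copy
    def load_table() -> pd.DataFrame:
        return pd.DataFrame({"Ticker": ["AAA", "BBB"]})

    @st.cache_resource        # caches the object itself; callers share one instance
    def load_handle():
        return object()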
+ # ─── CORE SEARCH & EXTRACTION ────────────────────────────────────────────
+
+ def semantic_search(query: str, top_k: int = 100):
+     """
+     Returns a list of (ticker, score) tuples from FAISS semantic search.
+     """
      q_emb = s2_model.encode([query], convert_to_numpy=True)
      faiss.normalize_L2(q_emb)
      D, I = faiss_index.search(q_emb, top_k)
      return [(etf_list[idx], float(D[0][i])) for i, idx in enumerate(I[0])]

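Usage is straightforward; the query and scores below are invented for illustration, but because the index holds L2-normalized vectors in an `IndexFlatIP`, the returned scores are cosine similarities sorted best first:

    results = semantic_search("low-cost S&P 500 index fund", top_k=5)
    for ticker, score in results:
        print(f"{ticker}\t{score:.3f}")   # e.g. "VOO  0.87" (illustrative values)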
+
+ def ensemble_ticker_extraction(query: str):
+     """
+     Runs two NER models in ensemble to extract possible ETF tickers.
+     """
      preds = set()
      for tok, mdl in ((tok1, m1), (tok2, m2)):
+         enc = tok(query, return_tensors="pt")
          with torch.no_grad():
              logits = mdl(**enc).logits
          ids = logits.argmax(dim=-1)[0].tolist()
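The hunk ends here, inside the loop of `ensemble_ticker_extraction`, so the decoding of `ids` into tickers is not visible in this diff. A typical way such a loop is finished, sketched under the assumption of BIO-style labels and using the `valid_ticker_set` built above (this is not the commit's actual code):

    # Continues the loop body above, at the same indentation as `ids = ...`:
    tokens = tok.convert_ids_to_tokens(enc["input_ids"][0].tolist())
    for token, label_id in zip(tokens, ids):
        if mdl.config.id2label[label_id] != "O":              # keep tagged tokens
            cand = token.replace("##", "").replace("▁", "").upper()  # strip subword prefixes
            if cand in valid_ticker_set:                      # keep only known tickers
                preds.add(cand)

    # ...and after the loop, presumably: return preds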