hskwon7 commited on
Commit
f0e4a41
·
verified ·
1 Parent(s): 30de4d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -29
app.py CHANGED
@@ -1,64 +1,120 @@
 
1
  import streamlit as st
 
 
 
 
2
  from sentence_transformers import SentenceTransformer
3
  import faiss
4
  from transformers import AutoTokenizer, AutoModelForTokenClassification
5
- import modules
6
- import torch
7
 
8
- # ─── CACHED RESOURCES ────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  @st.cache_resource(show_spinner=False)
10
- def load_search_index():
11
- # 1. Load the SentenceTransformer
 
 
 
 
 
 
 
 
 
12
  repo_name = "hskwon7/paraphrase-MiniLM-L6-v2-ft-for-etf-semantic-search"
13
- model = SentenceTransformer(repo_name)
14
 
15
- # 2. Build your document texts & embeddings
16
- df_etf, *_ = modules.load_etf_data()
17
- df_etf["doc"] = df_etf.apply(modules.make_doc_text, axis=1)
18
  ticker_list = df_etf["Ticker"].tolist()
19
- embs = model.encode(df_etf["doc"].tolist(), convert_to_numpy=True) # no progress bar
20
 
21
- # 3. Create FAISS index once
22
- faiss.normalize_L2(embs)
23
- index = faiss.IndexFlatIP(embs.shape[1])
24
- index.add(embs)
 
 
 
 
 
 
 
25
 
26
  return model, index, ticker_list
27
 
28
  @st.cache_resource(show_spinner=False)
29
  def load_ner_models():
30
- # Load your two NER pipelines
31
- repo1, repo2 = (
32
- "hskwon7/distilbert-base-uncased-for-etf-ticker",
33
- "hskwon7/albert-base-v2-for-etf-ticker",
34
- )
 
35
  tok1 = AutoTokenizer.from_pretrained(repo1)
36
  m1 = AutoModelForTokenClassification.from_pretrained(repo1)
37
  tok2 = AutoTokenizer.from_pretrained(repo2)
38
  m2 = AutoModelForTokenClassification.from_pretrained(repo2)
39
 
40
- # valid tickers
41
- df_etf, *_ = modules.load_etf_data()
42
- valid = set(t.upper() for t in df_etf["Ticker"].unique())
43
 
44
- return (tok1, m1), (tok2, m2), valid
45
 
46
- # ─── IN YOUR APP ────────────────────────────────────────────────────────────
47
 
48
- # at the top of your script, instead of re-loading inline:
49
- s2_model, faiss_index, etf_list = load_search_index()
 
 
50
  (tok1, m1), (tok2, m2), valid_ticker_set = load_ner_models()
51
 
52
- def semantic_search(query, top_k=100):
 
 
 
 
 
53
  q_emb = s2_model.encode([query], convert_to_numpy=True)
54
  faiss.normalize_L2(q_emb)
55
  D, I = faiss_index.search(q_emb, top_k)
56
  return [(etf_list[idx], float(D[0][i])) for i, idx in enumerate(I[0])]
57
 
58
- def ensemble_ticker_extraction(query):
 
 
 
 
59
  preds = set()
60
  for tok, mdl in ((tok1, m1), (tok2, m2)):
61
- enc = tok(query, return_tensors="pt")
62
  with torch.no_grad():
63
  logits = mdl(**enc).logits
64
  ids = logits.argmax(dim=-1)[0].tolist()
 
1
+ import os
2
  import streamlit as st
3
+ import uuid
4
+ import pandas as pd
5
+ import modules
6
+ import torch
7
  from sentence_transformers import SentenceTransformer
8
  import faiss
9
  from transformers import AutoTokenizer, AutoModelForTokenClassification
 
 
10
 
11
+ # ─── CACHES ─────────────────────────────────────────────────────────────────
12
+
13
@st.cache_data(show_spinner=False)
def load_etf_data():
    """
    Load the ETF master data, guaranteeing a precomputed 'doc' text column.

    The enriched CSV (raw data + 'doc' column) acts as an on-disk cache:
    it is built once from the raw CSV and reused on every later run.

    Returns:
        tuple: (df_etf, df_analyst_report, available_tickers,
                df_annual_return_master)
    """
    enriched_path = "etf_general_info_enriched_doc_added.csv"
    raw_path = "etf_general_info_enriched.csv"

    if not os.path.exists(enriched_path):
        # First run: derive the 'doc' column and persist the enriched file.
        info = pd.read_csv(raw_path).rename(columns={"ticker": "Ticker"})
        info["doc"] = info.apply(modules.make_doc_text, axis=1)
        info.to_csv(enriched_path, index=False)
    else:
        info = pd.read_csv(enriched_path)

    # Split the enriched frame into the working DataFrame and ticker list.
    df_etf, available_tickers = modules.set_etf_data(info)

    # Supporting tables used elsewhere in the app.
    df_analyst_report = pd.read_csv("etf_analyst_report_full.csv")
    df_annual_return_master = pd.read_csv("annual_return.csv").rename(
        columns={"ticker": "Ticker"}
    )

    return df_etf, df_analyst_report, available_tickers, df_annual_return_master
41
@st.cache_resource(show_spinner=False)
def build_search_resources():
    """
    Provide the SentenceTransformer model, FAISS index, and ticker list.

    The FAISS index is persisted to disk: built from document embeddings
    on the first run, then simply read back on subsequent runs.

    Returns:
        tuple: (model, index, ticker_list)
    """
    # ETF data (with precomputed 'doc' texts) comes from the cached loader.
    df_etf, _, _, _ = load_etf_data()

    repo_name = "hskwon7/paraphrase-MiniLM-L6-v2-ft-for-etf-semantic-search"
    model = SentenceTransformer(repo_name)

    ticker_list = df_etf["Ticker"].tolist()

    index_path = "etf_faiss.index"
    if os.path.exists(index_path):
        # Reuse the previously built index.
        # NOTE(review): assumes the on-disk index still matches the current
        # CSV rows — delete the file to force a rebuild after data changes.
        index = faiss.read_index(index_path)
    else:
        embeddings = model.encode(df_etf["doc"].tolist(), convert_to_numpy=True)
        faiss.normalize_L2(embeddings)  # cosine similarity via inner product
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)
        faiss.write_index(index, index_path)

    return model, index, ticker_list
72
 
73
@st.cache_resource(show_spinner=False)
def load_ner_models():
    """
    Load two NER models for ticker extraction and build the valid ticker set.

    Returns:
        tuple: ((tok1, m1), (tok2, m2), valid_ticker_set) where each pair is
        a (tokenizer, token-classification model) and valid_ticker_set holds
        the upper-cased tickers present in the ETF data.
    """
    repo1 = "hskwon7/distilbert-base-uncased-for-etf-ticker"
    repo2 = "hskwon7/albert-base-v2-for-etf-ticker"

    tok1 = AutoTokenizer.from_pretrained(repo1)
    m1 = AutoModelForTokenClassification.from_pretrained(repo1)
    tok2 = AutoTokenizer.from_pretrained(repo2)
    m2 = AutoModelForTokenClassification.from_pretrained(repo2)

    # Inference only: switch off dropout so predictions are deterministic.
    # (from_pretrained leaves modules in training mode.)
    m1.eval()
    m2.eval()

    df_etf, *_ = load_etf_data()
    valid_ticker_set = {t.upper() for t in df_etf["Ticker"].unique()}

    return (tok1, m1), (tok2, m2), valid_ticker_set
90
 
91
# ─── INITIALIZE CACHED RESOURCES ─────────────────────────────────────────────

# Load data & models once per session
# (the st.cache_data / st.cache_resource decorators make these calls cheap
# on Streamlit reruns — only the first call does real work).

df_etf, df_analyst_report, available_tickers, df_annual_return_master = load_etf_data()
s2_model, faiss_index, etf_list = build_search_resources()
(tok1, m1), (tok2, m2), valid_ticker_set = load_ner_models()
98
 
99
# ─── CORE SEARCH & EXTRACTION ─────────────────────────────────────────────────

def semantic_search(query: str, top_k: int = 100):
    """
    Return up to top_k (ticker, score) pairs from FAISS semantic search.

    Args:
        query: free-text search query.
        top_k: maximum number of results requested from the index.

    Returns:
        list[tuple[str, float]]: tickers with their inner-product scores,
        best match first. May contain fewer than top_k entries.
    """
    q_emb = s2_model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)  # normalized vectors => IP score == cosine sim
    D, I = faiss_index.search(q_emb, top_k)
    # FAISS pads I with -1 when the index holds fewer than top_k vectors;
    # without the filter, etf_list[-1] would silently yield the last ticker.
    return [
        (etf_list[idx], float(score))
        for idx, score in zip(I[0], D[0])
        if idx != -1
    ]
109
 
110
+
111
+ def ensemble_ticker_extraction(query: str):
112
+ """
113
+ Runs two NER models in ensemble to extract possible ETF tickers.
114
+ """
115
  preds = set()
116
  for tok, mdl in ((tok1, m1), (tok2, m2)):
117
+ enc = tok(query, return_tensors="pt")
118
  with torch.no_grad():
119
  logits = mdl(**enc).logits
120
  ids = logits.argmax(dim=-1)[0].tolist()