Spaces:
Sleeping
Sleeping
Commit
·
61b2df0
1
Parent(s):
e25e756
introduced
Browse files- requirements.txt +3 -0
- src/ticker_finder.py +205 -0
requirements.txt
CHANGED
|
@@ -20,6 +20,7 @@ executing==2.2.0
|
|
| 20 |
fastapi==0.115.8
|
| 21 |
ffmpy==0.5.0
|
| 22 |
filelock==3.17.0
|
|
|
|
| 23 |
fonttools==4.55.8
|
| 24 |
frozendict==2.4.6
|
| 25 |
frozenlist==1.5.0
|
|
@@ -105,6 +106,7 @@ python-multipart==0.0.20
|
|
| 105 |
pytz==2025.1
|
| 106 |
PyYAML==6.0.2
|
| 107 |
pyzmq==26.2.1
|
|
|
|
| 108 |
regex==2024.11.6
|
| 109 |
requests==2.32.3
|
| 110 |
requests-toolbelt==1.0.0
|
|
@@ -125,6 +127,7 @@ stack-data==0.6.3
|
|
| 125 |
starlette==0.45.3
|
| 126 |
sympy==1.13.1
|
| 127 |
ta==0.11.0
|
|
|
|
| 128 |
tenacity==9.0.0
|
| 129 |
threadpoolctl==3.5.0
|
| 130 |
tiktoken==0.8.0
|
|
|
|
| 20 |
fastapi==0.115.8
|
| 21 |
ffmpy==0.5.0
|
| 22 |
filelock==3.17.0
|
| 23 |
+
-e git+https://github.com/OnurKerimoglu/financial_agents.git@e25e7566c95beb6678ff63f4c3fb7798e1f6ec27#egg=Financial_Agents
|
| 24 |
fonttools==4.55.8
|
| 25 |
frozendict==2.4.6
|
| 26 |
frozenlist==1.5.0
|
|
|
|
| 106 |
pytz==2025.1
|
| 107 |
PyYAML==6.0.2
|
| 108 |
pyzmq==26.2.1
|
| 109 |
+
RapidFuzz==3.12.1
|
| 110 |
regex==2024.11.6
|
| 111 |
requests==2.32.3
|
| 112 |
requests-toolbelt==1.0.0
|
|
|
|
| 127 |
starlette==0.45.3
|
| 128 |
sympy==1.13.1
|
| 129 |
ta==0.11.0
|
| 130 |
+
tabulate==0.9.0
|
| 131 |
tenacity==9.0.0
|
| 132 |
threadpoolctl==3.5.0
|
| 133 |
tiktoken==0.8.0
|
src/ticker_finder.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from rapidfuzz import process, fuzz
|
| 6 |
+
import requests
|
| 7 |
+
from tabulate import tabulate
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class TickerFinder():
    """
    Resolve free-text user input to the best matching company tickers.

    Ticker/title pairs come from https://www.sec.gov/files/company_tickers.json.
    The raw download and a compact (ticker + title only) copy are cached as
    local JSON files; the rapidfuzz package is used for fuzzy matching.
    """

    # Source of the raw ticker data.
    SEC_TICKERS_URL = "https://www.sec.gov/files/company_tickers.json"
    # Upper bound (seconds) for the download so a stalled connection
    # cannot block the caller forever.
    REQUEST_TIMEOUT_S = 30

    def __init__(self):
        """
        Initialize the TickerFinder object.

        Sets the file paths and reads the ticker data into the self.df
        attribute (building the compact file first if it does not exist).
        """
        self.logger = logging.getLogger(__name__)
        # abspath keeps the derived paths correct even when __file__ is relative
        self.rootdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        self.fname_raw = os.path.join(self.rootdir, 'data_raw', 'sec_gov_company_tickers_test.json')
        self.fname_compact = os.path.join(self.rootdir, 'data', 'sec_gov_company_tickers_compact.json')
        self.df = self.read_ticker_data()
        self.logger.info('Initialized TickerFinder object')

    @staticmethod
    def _ensure_parent_dir(path: str) -> None:
        """Create the directory that will contain `path`, if it is missing."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def read_ticker_data(
        self
    ) -> pd.DataFrame:
        """
        Read compact ticker data from a local file.

        If the compact file does not exist yet, it is created first via
        compact_ticker_data().

        Returns:
            df : pandas DataFrame
                One row per company, with the columns 'ticker' and 'title'
        """
        # if the compact data is not available, create it
        if not os.path.exists(self.fname_compact):
            self.logger.info(f'Compact ticker data was not found at {self.fname_compact}, creating it')
            self.compact_ticker_data()
        with open(self.fname_compact, 'r', encoding='utf-8') as f:
            data = json.load(f)
        df = pd.DataFrame.from_dict(data, orient='columns')
        self.logger.info(f'Read compact ticker data from {self.fname_compact}')
        return df

    def compact_ticker_data(
        self
    ) -> None:
        """
        Compact the raw ticker data by extracting only the ticker and title
        fields and saving them to a local file.

        If the raw data is not available, this method will download it first.
        The raw file is expected to map stringified integers to records that
        have at least the keys 'ticker' and 'title'.
        """
        if not os.path.exists(self.fname_raw):
            self.logger.info(f'Raw ticker data was not found at {self.fname_raw}, downloading it')
            self.download_ticker_data()
        # read the raw data
        with open(self.fname_raw, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # extract the necessary fields, ordered by the numeric value of the
        # record key (does not require the keys to be contiguous)
        records = [data[key] for key in sorted(data, key=int)]
        data_compact = {
            'ticker': [record['ticker'] for record in records],
            'title': [record['title'] for record in records],
        }
        # save the compact data; the target directory may not exist yet
        self._ensure_parent_dir(self.fname_compact)
        with open(self.fname_compact, 'w', encoding='utf-8') as f:
            json.dump(data_compact, f)
        self.logger.info(f'Compacted raw ticker data into {self.fname_compact}')

    def download_ticker_data(
        self
    ) -> None:
        """
        Download the raw ticker data from https://www.sec.gov/files/company_tickers.json
        using the requests package and save it, unmodified in structure, as a
        local JSON file.

        Raises:
            RuntimeError
                If the server responds with a status code other than 200
        """
        # NOTE(review): the User-Agent value looks like a redacted placeholder;
        # presumably it should carry real contact information - confirm.
        headers = {
            "User-Agent": "censored_email_address",
            "Accept-Encoding": "gzip, deflate",
            "Host": "www.sec.gov",
            "Connection": "keep-alive"
        }
        response = requests.get(
            self.SEC_TICKERS_URL,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT_S)
        if response.status_code != 200:
            raise RuntimeError(f"Error downloading ticker data from url.\nResponse status code: {response.status_code}")
        # decode once and store as plain JSON; no need to go through pandas
        data = response.json()
        # save the raw data; the target directory may not exist yet
        self._ensure_parent_dir(self.fname_raw)
        with open(self.fname_raw, 'w', encoding='utf-8') as f:
            json.dump(data, f)
        self.logger.info(f'Dowloaded raw ticker data into {self.fname_raw}')

    def _extract_matches(
        self,
        query: str,
        choices: pd.Series,
        top_n: int) -> pd.DataFrame:
        """
        Fuzzy-match `query` against `choices` and look up the matched rows.

        Args:
            query : str
                The (already normalized) string to search for
            choices : pd.Series
                Candidate strings, sharing the index of self.df
            top_n : int
                The number of top matches to return
        Returns:
            results : pd.DataFrame
                A pd.df with the columns 'Title', 'Ticker' and 'Score'
        """
        matches = process.extract(
            query,
            choices,
            scorer=fuzz.ratio,
            limit=top_n)
        rows = []
        for _, score, key in matches:
            # for pandas input rapidfuzz reports the index label of the match,
            # hence the label-based lookup
            record = self.df.loc[key]
            rows.append((record["title"], record["ticker"], score))
        return pd.DataFrame(rows, columns=["Title", "Ticker", "Score"])

    def find_best_matching_title(
        self,
        input_name: str,
        top_n: int = 3) -> pd.DataFrame:
        """
        Find the best matching company title for a given company name.

        The comparison is case-insensitive.

        Args:
            input_name : str
                The name to search for
            top_n : int, default=3
                The number of top matches to return
        Returns:
            results : pd.DataFrame
                A pd.df containing the matched title, ticker, and fuzzy matching score
        """
        return self._extract_matches(
            input_name.lower(),
            self.df["title"].str.lower(),
            top_n)

    def find_best_matching_ticker(
        self,
        ticker: str,
        top_n: int = 3) -> pd.DataFrame:
        """
        Find the best matching company ticker for a given ticker.

        The input is upper-cased before it is compared with the stored tickers.

        Args:
            ticker : str
                The ticker to search for
            top_n : int, default=3
                The number of top matches to return
        Returns:
            results : pd.DataFrame
                A pd.df containing the title, matched ticker, and fuzzy matching score
        """
        return self._extract_matches(
            ticker.upper(),
            self.df["ticker"],
            top_n)

    def find_best_matching_ticker_or_title(
        self,
        user_input: str
    ) -> str:
        """
        Find the best matching company ticker for a given user input, which may be a ticker or a title.

        Args:
            user_input : str
                The user input to search for
        Returns:
            results : str
                A string containing the best matching titles and tickers,
                one company per line, best match first
        """
        # user may be trying to write a ticker, in which case find the best matching ticker:
        ticker_matches = self.find_best_matching_ticker(user_input)
        # user may be trying to write a title, in which case find the best matching title:
        title_matches = self.find_best_matching_title(user_input)
        # total matches:
        c_matches = pd.concat([ticker_matches, title_matches])
        # deduplicate: a company found by both searches gets the sum of its scores
        c_matches_dedup = c_matches.groupby(['Ticker', 'Title'], as_index=False)['Score'].sum()
        # sort by score:
        c_matches_sorted = c_matches_dedup.sort_values(by='Score', ascending=False)
        # convert results into a pretty string:
        return self.df_to_pretty_string(c_matches_sorted)

    def df_to_pretty_string(
        self,
        df: pd.DataFrame) -> str:
        """
        Convert a pd.DataFrame into a pretty string, using the tabulate package.

        Only the 'Title' and 'Ticker' columns are rendered, without headers
        and without the index.

        Args:
            df : pd.DataFrame
                The dataframe to convert

        Returns:
            pretty_string : str
                A string containing the pretty-formatted dataframe
        """
        pretty_string = tabulate(df[['Title', 'Ticker']],
                                 tablefmt='plain',
                                 showindex=False,
                                 numalign='left',
                                 stralign='left')
        return pretty_string
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
if __name__ == "__main__":
    # Manual smoke test: look up a sample query and show the ranked matches.
    finder = TickerFinder()
    matches_text = finder.find_best_matching_ticker_or_title("microsoft")
    print(matches_text)
|