StockAnalysisAgent / src /ticker_finder.py
OnurKerimoglu's picture
added ticker finder in the app, match results in html
ac40eee
import json
import logging
import os
import pandas as pd
from rapidfuzz import process, fuzz
import requests
from tabulate import tabulate
class TickerFinder():
    """
    Find the best matching ticker for a given company name or ticker.

    Uses data from https://www.sec.gov/files/company_tickers.json and the
    rapidfuzz package for fuzzy matching. The data is cached locally in two
    files: the raw download and a compact version holding only the ticker
    and title columns.
    """

    # Seconds to wait for the SEC server before giving up on the download.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """
        Initialize the TickerFinder object.

        Sets the file paths and reads the ticker data into the self.df
        attribute (creating/downloading the local files when missing).
        """
        self.logger = logging.getLogger(__name__)
        # abspath: a relative __file__ would otherwise yield an empty or
        # wrong root directory depending on the current working directory.
        self.rootdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        self.fname_raw = os.path.join(self.rootdir, 'data_raw', 'sec_gov_company_tickers_test.json')
        self.fname_compact = os.path.join(self.rootdir, 'data', 'sec_gov_company_tickers_compact.json')
        self.df = self.read_ticker_data()
        self.logger.info('Initialized TickerFinder object')

    @staticmethod
    def _ensure_parent_dir(path: str) -> None:
        """Create the directory that will contain `path`, if it has one."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def read_ticker_data(
            self
            ) -> pd.DataFrame:
        """
        Read compact ticker data from a local file.

        If the compact file does not exist yet, it is created first via
        compact_ticker_data().

        Returns:
            df : pd.DataFrame
                A dataframe with the columns 'ticker' and 'title'
        """
        # if the compact data is not available, create it
        if not os.path.exists(self.fname_compact):
            self.logger.info(f'Compact ticker data was not found at {self.fname_compact}, creating it')
            self.compact_ticker_data()
        with open(self.fname_compact, 'r', encoding='utf-8') as f:
            data = json.load(f)
        df = pd.DataFrame.from_dict(data, orient='columns')
        self.logger.info(f'Read compact ticker data from {self.fname_compact}')
        return df

    def compact_ticker_data(
            self
            ) -> None:
        """
        Compact the raw ticker data by extracting only the ticker and title
        fields and saving them to a local file.

        If the raw data is not available, this method will download it first.
        The raw file is expected to map integer-like string keys to records
        holding at least 'ticker' and 'title'; records are written in
        ascending key order.

        Raises:
            ValueError: if a key of the raw data is not an integer string
            KeyError: if a record lacks 'ticker' or 'title'
        """
        if not os.path.exists(self.fname_raw):
            self.logger.info(f'Raw ticker data was not found at {self.fname_raw}, downloading it')
            self.download_ticker_data()
        # read the raw data
        with open(self.fname_raw, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Order by numeric key rather than indexing pre-sized lists, so that
        # gaps in the key sequence do not raise an IndexError.
        records = [data[key] for key in sorted(data, key=int)]
        data_compact = {
            'ticker': [record['ticker'] for record in records],
            'title': [record['title'] for record in records],
        }
        # save the compact data (the target directory may not exist yet)
        self._ensure_parent_dir(self.fname_compact)
        with open(self.fname_compact, 'w', encoding='utf-8') as f:
            json.dump(data_compact, f)
        self.logger.info(f'Compacted raw ticker data into {self.fname_compact}')

    def download_ticker_data(
            self
            ) -> None:
        """
        Download the raw ticker data from
        https://www.sec.gov/files/company_tickers.json using the requests
        package and save it to a local JSON file.

        Raises:
            Exception: if the response status code is not 200
            requests.exceptions.RequestException: on connection problems,
                including exceeding REQUEST_TIMEOUT
        """
        url = "https://www.sec.gov/files/company_tickers.json"
        headers = {
            "User-Agent": "censored_email_address",
            "Accept-Encoding": "gzip, deflate",
            "Host": "www.sec.gov",
            "Connection": "keep-alive"
        }
        # Without a timeout, requests may block forever on a stalled server.
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise Exception(f"Error downloading ticker data from url.\nResponse status code: {response.status_code}")
        # Store the decoded payload as-is; a round trip through
        # pd.read_json(<literal string>) is deprecated in recent pandas.
        data = response.json()
        # save the raw data (the target directory may not exist yet)
        self._ensure_parent_dir(self.fname_raw)
        with open(self.fname_raw, 'w', encoding='utf-8') as f:
            json.dump(data, f)
        self.logger.info(f'Dowloaded raw ticker data into {self.fname_raw}')

    def _fuzzy_lookup(
            self,
            query: str,
            choices: pd.Series,
            top_n: int
            ) -> pd.DataFrame:
        """
        Run a rapidfuzz search of `query` against `choices` and return the
        corresponding rows of self.df.

        Args:
            query : str
                The (already normalized) string to search for
            choices : pd.Series
                Candidate strings, sharing the index of self.df
            top_n : int
                The number of top matches to return
        Returns:
            results : pd.DataFrame
                A pd.df with the columns 'Title', 'Ticker' and 'Score'
        """
        matches = process.extract(
            query,
            choices,
            scorer=fuzz.ratio,
            limit=top_n)
        # For a Series, the third element of each match is the index label
        # of the choice, hence the label-based .loc lookup.
        rows = [
            (self.df.loc[key, "title"], self.df.loc[key, "ticker"], score)
            for _, score, key in matches
        ]
        return pd.DataFrame(rows, columns=["Title", "Ticker", "Score"])

    def find_best_matching_title(
            self,
            input_name: str,
            top_n: int = 5) -> pd.DataFrame:
        """
        Find the best matching company title for a given company name.

        The comparison is case-insensitive: both the input and the titles
        are lower-cased before matching.

        Args:
            input_name : str
                The name to search for
            top_n : int, default=5
                The number of top matches to return
        Returns:
            results : pd.DataFrame
                A pd.df containing the matched title, ticker, and fuzzy matching score
        """
        return self._fuzzy_lookup(
            input_name.lower(),
            self.df["title"].str.lower(),
            top_n)

    def find_best_matching_ticker(
            self,
            ticker: str,
            top_n: int = 5
            ) -> pd.DataFrame:
        """
        Find the best matching company ticker for a given ticker.

        The input is upper-cased before matching; the stored tickers are
        compared as they are.

        Args:
            ticker : str
                The ticker to search for
            top_n : int, default=5
                The number of top matches to return
        Returns:
            results : pd.DataFrame
                A pd.df containing the title, matched ticker, and fuzzy matching score
        """
        return self._fuzzy_lookup(
            ticker.upper(),
            self.df["ticker"],
            top_n)

    def find_best_matching_ticker_or_title(
            self,
            user_input: str
            ) -> str:
        """
        Find the best matching companies for a given user input, which may
        be a ticker or a title.

        Args:
            user_input : str
                The user input to search for
        Returns:
            results : str
                A formatted table (see df_to_pretty_string) listing the
                best matching company names and ticker symbols
        """
        # user may be trying to write a ticker, in which case find the best matching ticker:
        ticker_matches = self.find_best_matching_ticker(user_input)
        # user may be trying to write a title, in which case find the best matching title:
        title_matches = self.find_best_matching_title(user_input)
        # total matches:
        c_matches = pd.concat([ticker_matches, title_matches])
        # deduplicate: a company found by both searches gets its scores summed,
        # which ranks it above companies found by only one of them
        c_matches_dedup = c_matches.groupby(['Ticker', 'Title'], as_index=False)['Score'].sum()
        # sort by score:
        c_matches_sorted = c_matches_dedup.sort_values(by='Score', ascending=False)
        # convert results into a pretty string:
        return self.df_to_pretty_string(c_matches_sorted)

    def df_to_pretty_string(
            self,
            df: pd.DataFrame,
            num_rows: int = 5
            ) -> str:
        """
        Convert a pd.DataFrame into a pretty string, using the tabulate package.

        Args:
            df : pd.DataFrame
                The dataframe to convert; must contain the columns
                'Title' and 'Ticker'
            num_rows : int, default=5
                The maximum number of leading rows to include
        Returns:
            pretty_string : str
                An HTML table with the columns 'Company Name' and
                'Ticker Symbol'
        """
        df = df.rename(columns={'Title': 'Company Name', 'Ticker': 'Ticker Symbol'})
        df_subset = df[['Company Name', 'Ticker Symbol']].iloc[0:num_rows]
        pretty_table = tabulate(
            df_subset,
            headers='keys',
            tablefmt='html',
            showindex=False,
            numalign='left',
            stralign='left')
        return pretty_table

    def does_ticker_exist(
            self,
            ticker: str
            ) -> bool:
        """
        Check whether a given ticker exists in the ticker data.

        The check is an exact, case-sensitive comparison against the
        'ticker' column of self.df.

        Args:
            ticker : str
                The ticker to check
        Returns:
            exists : bool
                True if the ticker exists, False otherwise
        """
        return ticker in self.df['ticker'].values
# if __name__ == "__main__":
# results = TickerFinder().find_best_matching_ticker_or_title("microsoft")
# print(results)
# exists = TickerFinder().does_ticker_exist('bbbbb')
# print(f'Ticker exists') if exists else print(f'Ticker does not exist')