Spaces:

OnursFriends
/

StockAnalysisAgent

Running

App Files Files Community

StockAnalysisAgent / src /ticker_finder.py

OnurKerimoglu

added ticker finder in the app, match results in html

ac40eee 11 months ago

raw

history blame contribute delete

8.44 kB

	import json
	import logging
	import os
	import pandas as pd
	from rapidfuzz import process, fuzz
	import requests
	from tabulate import tabulate


	class TickerFinder:
	    """
	    Find the best matching ticker for a given company name or ticker.

	    Uses data from https://www.sec.gov/files/company_tickers.json and the
	    rapidfuzz package for fuzzy matching.

	    Instance attributes (all set by ``__init__``):
	        logger : logger named after this module
	        rootdir : directory one level above the one containing this file
	        fname_raw : path of the raw ticker JSON file
	        fname_compact : path of the compact (ticker/title only) JSON file
	        df : pd.DataFrame with the columns 'ticker' and 'title'
	    """

	    # Seconds to wait for the server before giving up on the download.
	    DOWNLOAD_TIMEOUT = 30

	    def __init__(self):
	        """
	        Initialize the TickerFinder object.
	        This method sets the file paths and reads the ticker data into the self.df attribute.
	        """
	        self.logger = logging.getLogger(__name__)
	        # abspath keeps the data paths valid even if __file__ is relative
	        # and the working directory changes later on.
	        self.rootdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	        self.fname_raw = os.path.join(self.rootdir, 'data_raw', 'sec_gov_company_tickers_test.json')
	        self.fname_compact = os.path.join(self.rootdir, 'data', 'sec_gov_company_tickers_compact.json')
	        self.df = self.read_ticker_data()
	        self.logger.info('Initialized TickerFinder object')

	    @staticmethod
	    def _ensure_parent_dir(path: str) -> None:
	        """Create the directory that will contain ``path`` if it is missing."""
	        parent = os.path.dirname(path)
	        if parent:
	            os.makedirs(parent, exist_ok=True)

	    def _matches_to_frame(self, matches) -> pd.DataFrame:
	        """
	        Turn rapidfuzz matches into a DataFrame with Title/Ticker/Score columns.
	        Args:
	            matches : iterable of (choice, score, key) tuples as returned by
	                rapidfuzz.process.extract
	        Returns:
	            df : pd.DataFrame
	                One row per match, in the order given
	        """
	        results = []
	        for _choice, score, idx in matches:
	            # NOTE(review): idx is the key reported by rapidfuzz for the matched
	            # choice; it is used as a row position here, which presumably relies
	            # on self.df having a default 0..n-1 index -- confirm.
	            row = self.df.iloc[idx]
	            results.append((row["title"], row["ticker"], score))
	        return pd.DataFrame(results, columns=["Title", "Ticker", "Score"])

	    def read_ticker_data(
	        self
	    ) -> pd.DataFrame:
	        """
	        Read compact ticker data from a local file.
	        If the compact file does not exist yet, it is created first via
	        compact_ticker_data().
	        Returns:
	            df : pd.DataFrame
	                The content of the compact file, one column per top-level key
	        """
	        # if the compact data is not available, create it
	        if not os.path.exists(self.fname_compact):
	            self.logger.info('Compact ticker data was not found at %s, creating it', self.fname_compact)
	            self.compact_ticker_data()
	        with open(self.fname_compact, 'r', encoding='utf-8') as f:
	            data = json.load(f)
	        df = pd.DataFrame.from_dict(data, orient='columns')
	        self.logger.info('Read compact ticker data from %s', self.fname_compact)
	        return df

	    def compact_ticker_data(
	        self
	    ) -> None:
	        """
	        Compact the raw ticker data by extracting only the ticker and title fields and
	        saving them to a local file.
	        If the raw data is not available, this method will download it first.
	        The records are written in ascending order of their numeric key.
	        Raises:
	            ValueError : if a top-level key of the raw data is not an integer string
	            KeyError : if a record lacks the 'ticker' or 'title' field
	        """
	        if not os.path.exists(self.fname_raw):
	            self.logger.info('Raw ticker data was not found at %s, downloading it', self.fname_raw)
	            self.download_ticker_data()
	        # read the raw data
	        with open(self.fname_raw, 'r', encoding='utf-8') as f:
	            data = json.load(f)
	        # Order the records by numeric key instead of writing into a
	        # preallocated list: this gives the same result for keys 0..n-1 and
	        # does not fail when the keys have gaps.
	        records = [data[key] for key in sorted(data, key=int)]
	        data_compact = {
	            'ticker': [record['ticker'] for record in records],
	            'title': [record['title'] for record in records],
	        }
	        # save the compact data (the target directory may not exist yet)
	        self._ensure_parent_dir(self.fname_compact)
	        with open(self.fname_compact, 'w', encoding='utf-8') as f:
	            json.dump(data_compact, f)
	        self.logger.info('Compacted raw ticker data into %s', self.fname_compact)

	    def download_ticker_data(
	        self
	    ) -> None:
	        """
	        Download the raw ticker data from https://www.sec.gov/files/company_tickers.json
	        using the requests package and save it to self.fname_raw as JSON.
	        Raises:
	            RuntimeError : if the response status code is not 200
	            requests.exceptions.RequestException : on connection problems or
	                when the server does not answer within DOWNLOAD_TIMEOUT seconds
	        """
	        url = "https://www.sec.gov/files/company_tickers.json"
	        # NOTE(review): the User-Agent value is a placeholder in this copy of the
	        # file; presumably a real contact address is expected -- confirm.
	        headers = {
	            "User-Agent": "censored_email_address",
	            "Accept-Encoding": "gzip, deflate",
	            "Host": "www.sec.gov",
	            "Connection": "keep-alive"
	        }
	        # Without a timeout a stalled connection would block forever.
	        response = requests.get(url, headers=headers, timeout=self.DOWNLOAD_TIMEOUT)
	        if response.status_code != 200:
	            raise RuntimeError(
	                f"Error downloading ticker data from {url}.\nResponse status code: {response.status_code}"
	            )
	        # Parse and store the payload directly; a detour through a DataFrame
	        # is not needed to write the same JSON structure back out.
	        data = response.json()
	        # save the raw data (the target directory may not exist yet)
	        self._ensure_parent_dir(self.fname_raw)
	        with open(self.fname_raw, 'w', encoding='utf-8') as f:
	            json.dump(data, f)
	        self.logger.info('Downloaded raw ticker data into %s', self.fname_raw)

	    def find_best_matching_title(
	        self,
	        input_name: str,
	        top_n: int = 5
	    ) -> pd.DataFrame:
	        """
	        Find the best matching company title for a given company name.
	        The comparison is case-insensitive (both sides are lower-cased).
	        Args:
	            input_name : str
	                The name to search for
	            top_n : int, default=5
	                The number of top matches to return
	        Returns:
	            results : pd.DataFrame
	                A pd.df containing the matched title, ticker, and fuzzy matching score
	        """
	        matches = process.extract(
	            input_name.lower(),
	            self.df["title"].str.lower(),
	            scorer=fuzz.ratio,
	            limit=top_n)
	        return self._matches_to_frame(matches)

	    def find_best_matching_ticker(
	        self,
	        ticker: str,
	        top_n: int = 5
	    ) -> pd.DataFrame:
	        """
	        Find the best matching company ticker for a given ticker.
	        The input is upper-cased before it is compared with the stored tickers.
	        Args:
	            ticker : str
	                The ticker to search for
	            top_n : int, default=5
	                The number of top matches to return
	        Returns:
	            results : pd.DataFrame
	                A pd.df containing the title, matched ticker, and fuzzy matching score
	        """
	        matches = process.extract(
	            ticker.upper(),
	            self.df["ticker"],
	            scorer=fuzz.ratio,
	            limit=top_n)
	        return self._matches_to_frame(matches)

	    def find_best_matching_ticker_or_title(
	        self,
	        user_input: str
	    ) -> str:
	        """
	        Find the best matching company ticker for a given user input, which may be a ticker or a title.
	        Args:
	            user_input : str
	                The user input to search for
	        Returns:
	            results : str
	                A string (table produced by df_to_pretty_string) containing the
	                best matching titles and tickers
	        """
	        # user may be trying to write a ticker, in which case find the best matching ticker:
	        ticker_matches = self.find_best_matching_ticker(user_input)
	        # user may be trying to write a title, in which case find the best matching title:
	        title_matches = self.find_best_matching_title(user_input)
	        # total matches:
	        c_matches = pd.concat([ticker_matches, title_matches])
	        # deduplicate: a company found by both searches gets the sum of its two scores
	        c_matches_dedup = c_matches.groupby(['Ticker', 'Title'], as_index=False)['Score'].sum()
	        # sort by score:
	        c_matches_sorted = c_matches_dedup.sort_values(by='Score', ascending=False)
	        # convert results into a pretty string:
	        return self.df_to_pretty_string(c_matches_sorted)

	    def df_to_pretty_string(
	        self,
	        df: pd.DataFrame,
	        num_rows: int = 5
	    ) -> str:
	        """
	        Convert a pd.DataFrame into a pretty string, using the tabulate package.
	        Args:
	            df : pd.DataFrame
	                The dataframe to convert; must have the columns 'Title' and 'Ticker'
	            num_rows : int, default=5
	                The maximum number of leading rows to include
	        Returns:
	            pretty_string : str
	                The first num_rows rows rendered by tabulate with tablefmt='html',
	                with the columns 'Company Name' and 'Ticker Symbol'
	        """
	        df = df.rename(columns={'Title': 'Company Name', 'Ticker': 'Ticker Symbol'})
	        df_subset = df[['Company Name', 'Ticker Symbol']].iloc[0:num_rows]
	        pretty_table = tabulate(
	            df_subset,
	            headers='keys',
	            tablefmt='html',
	            showindex=False,
	            numalign='left',
	            stralign='left')
	        return pretty_table

	    def does_ticker_exist(
	        self,
	        ticker: str
	    ) -> bool:
	        """
	        Check whether a given ticker exists in the ticker data.
	        The check is an exact comparison, so it is case-sensitive.
	        Args:
	            ticker : str
	                The ticker to check
	        Returns:
	            exists : bool
	                True if the ticker exists, False otherwise
	        """
	        return ticker in self.df['ticker'].values


	# if __name__ == "__main__":
	# results = TickerFinder().find_best_matching_ticker_or_title("microsoft")
	# print(results)
	# exists = TickerFinder().does_ticker_exist('bbbbb')
	# print(f'Ticker exists') if exists else print(f'Ticker does not exist')