OnurKerimoglu committed on
Commit
61b2df0
·
1 Parent(s): e25e756

introduced

Browse files
Files changed (2) hide show
  1. requirements.txt +3 -0
  2. src/ticker_finder.py +205 -0
requirements.txt CHANGED
@@ -20,6 +20,7 @@ executing==2.2.0
20
  fastapi==0.115.8
21
  ffmpy==0.5.0
22
  filelock==3.17.0
 
23
  fonttools==4.55.8
24
  frozendict==2.4.6
25
  frozenlist==1.5.0
@@ -105,6 +106,7 @@ python-multipart==0.0.20
105
  pytz==2025.1
106
  PyYAML==6.0.2
107
  pyzmq==26.2.1
 
108
  regex==2024.11.6
109
  requests==2.32.3
110
  requests-toolbelt==1.0.0
@@ -125,6 +127,7 @@ stack-data==0.6.3
125
  starlette==0.45.3
126
  sympy==1.13.1
127
  ta==0.11.0
 
128
  tenacity==9.0.0
129
  threadpoolctl==3.5.0
130
  tiktoken==0.8.0
 
20
  fastapi==0.115.8
21
  ffmpy==0.5.0
22
  filelock==3.17.0
23
+ -e git+https://github.com/OnurKerimoglu/financial_agents.git@e25e7566c95beb6678ff63f4c3fb7798e1f6ec27#egg=Financial_Agents
24
  fonttools==4.55.8
25
  frozendict==2.4.6
26
  frozenlist==1.5.0
 
106
  pytz==2025.1
107
  PyYAML==6.0.2
108
  pyzmq==26.2.1
109
+ RapidFuzz==3.12.1
110
  regex==2024.11.6
111
  requests==2.32.3
112
  requests-toolbelt==1.0.0
 
127
  starlette==0.45.3
128
  sympy==1.13.1
129
  ta==0.11.0
130
+ tabulate==0.9.0
131
  tenacity==9.0.0
132
  threadpoolctl==3.5.0
133
  tiktoken==0.8.0
src/ticker_finder.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import pandas as pd
5
+ from rapidfuzz import process, fuzz
6
+ import requests
7
+ from tabulate import tabulate
8
+
9
+
10
class TickerFinder:
    """
    Find the best matching ticker for a given company name or ticker.

    Uses data from https://www.sec.gov/files/company_tickers.json and the
    rapidfuzz package for fuzzy matching. The raw download is cached in
    ``data_raw/`` and a compact (ticker/title only) version in ``data/``,
    both located next to the ``src`` directory.
    """

    def __init__(self):
        """
        Initialize the TickerFinder object.

        Sets the file paths and reads the ticker data into the ``self.df``
        attribute (creating or downloading the cached files if needed).
        """
        self.logger = logging.getLogger(__name__)
        # abspath: __file__ may be relative, in which case dirname() would
        # collapse to '' and the data paths would resolve against the cwd.
        self.rootdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        self.fname_raw = os.path.join(self.rootdir, 'data_raw', 'sec_gov_company_tickers_test.json')
        self.fname_compact = os.path.join(self.rootdir, 'data', 'sec_gov_company_tickers_compact.json')
        self.df = self.read_ticker_data()
        self.logger.info('Initialized TickerFinder object')

    @staticmethod
    def _ensure_parent_dir(path: str) -> None:
        """Create the directory that will contain ``path`` if it is missing."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def read_ticker_data(self) -> pd.DataFrame:
        """
        Read compact ticker data from a local file.

        If the compact file does not exist yet, it is created first via
        ``compact_ticker_data``.

        Returns:
            df : pd.DataFrame
                A frame with the columns ``ticker`` and ``title``
        """
        # if the compact data is not available, create it
        if not os.path.exists(self.fname_compact):
            self.logger.info(
                'Compact ticker data was not found at %s, creating it',
                self.fname_compact)
            self.compact_ticker_data()
        with open(self.fname_compact, 'r', encoding='utf-8') as f:
            data = json.load(f)
        df = pd.DataFrame.from_dict(data, orient='columns')
        self.logger.info('Read compact ticker data from %s', self.fname_compact)
        return df

    def compact_ticker_data(self) -> None:
        """
        Compact the raw ticker data.

        Extracts only the ticker and title fields from the raw file and saves
        them to ``self.fname_compact`` as ``{'ticker': [...], 'title': [...]}``.
        If the raw data is not available, it is downloaded first.
        """
        if not os.path.exists(self.fname_raw):
            self.logger.info(
                'Raw ticker data was not found at %s, downloading it',
                self.fname_raw)
            self.download_ticker_data()
        # read the raw data
        with open(self.fname_raw, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # The raw file maps stringified integer keys to records. Order the
        # records by the numeric key instead of using it as a list position,
        # so gaps in the key sequence cannot raise IndexError.
        entries = [
            entry
            for _, entry in sorted(data.items(), key=lambda item: int(item[0]))
        ]
        data_compact = {
            'ticker': [entry['ticker'] for entry in entries],
            'title': [entry['title'] for entry in entries],
        }
        # save the compact data
        self._ensure_parent_dir(self.fname_compact)
        with open(self.fname_compact, 'w', encoding='utf-8') as f:
            json.dump(data_compact, f)
        self.logger.info('Compacted raw ticker data into %s', self.fname_compact)

    def download_ticker_data(self) -> None:
        """
        Download the raw ticker data from
        https://www.sec.gov/files/company_tickers.json using the requests
        package and save it as a JSON file at ``self.fname_raw``.

        Raises:
            RuntimeError: if the response status code is not 200.
            requests.RequestException: on connection problems or timeout.
            ValueError: if the response body is not valid JSON.
        """
        url = "https://www.sec.gov/files/company_tickers.json"
        headers = {
            # NOTE(review): placeholder value; sec.gov presumably expects a
            # real contact string here - confirm before deploying.
            "User-Agent": "censored_email_address",
            "Accept-Encoding": "gzip, deflate",
            "Host": "www.sec.gov",
            "Connection": "keep-alive"
        }
        # A timeout keeps object construction from hanging forever when the
        # server does not answer.
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            raise RuntimeError(
                f"Error downloading ticker data from url.\nResponse status code: {response.status_code}")
        # Parse and re-serialize with json directly; no DataFrame round trip
        # is needed just to store the payload.
        data = response.json()
        # save the raw data
        self._ensure_parent_dir(self.fname_raw)
        with open(self.fname_raw, 'w', encoding='utf-8') as f:
            json.dump(data, f)
        self.logger.info('Downloaded raw ticker data into %s', self.fname_raw)

    def _matches_to_frame(self, matches) -> pd.DataFrame:
        """
        Turn rapidfuzz ``(choice, score, idx)`` tuples into a result frame.

        ``idx`` is used positionally on ``self.df``; the frame is built with a
        default integer index, so position and label coincide.
        """
        results = []
        for _, score, idx in matches:
            row = self.df.iloc[idx]
            results.append((row["title"], row["ticker"], score))
        return pd.DataFrame(results, columns=["Title", "Ticker", "Score"])

    def find_best_matching_title(
            self,
            input_name: str,
            top_n: int = 3) -> pd.DataFrame:
        """
        Find the best matching company title for a given company name.

        Args:
            input_name : str
                The name to search for (compared case-insensitively)
            top_n : int, default=3
                The number of top matches to return
        Returns:
            results : pd.DataFrame
                A pd.df containing the matched title, ticker, and fuzzy matching score
        """
        matches = process.extract(
            input_name.strip().lower(),
            self.df["title"].str.lower(),
            scorer=fuzz.ratio,
            limit=top_n)
        return self._matches_to_frame(matches)

    def find_best_matching_ticker(
            self,
            ticker: str,
            top_n: int = 3) -> pd.DataFrame:
        """
        Find the best matching company ticker for a given ticker.

        Args:
            ticker : str
                The ticker to search for (upper-cased before comparison)
            top_n : int, default=3
                The number of top matches to return
        Returns:
            results : pd.DataFrame
                A pd.df containing the title, matched ticker, and fuzzy matching score
        """
        matches = process.extract(
            ticker.strip().upper(),
            self.df["ticker"],
            scorer=fuzz.ratio,
            limit=top_n)
        return self._matches_to_frame(matches)

    def find_best_matching_ticker_or_title(
            self,
            user_input: str) -> str:
        """
        Find the best matching companies for a given user input, which may be
        a ticker or a title.

        Args:
            user_input : str
                The user input to search for
        Returns:
            results : str
                A string listing the best matching titles and tickers,
                best match first
        """
        # user may be trying to write a ticker, in which case find the best matching ticker:
        ticker_matches = self.find_best_matching_ticker(user_input)
        # user may be trying to write a title, in which case find the best matching title:
        title_matches = self.find_best_matching_title(user_input)
        # total matches:
        c_matches = pd.concat([ticker_matches, title_matches])
        # deduplicate; a company found by both searches gets the sum of its
        # two scores and therefore ranks higher:
        c_matches_dedup = c_matches.groupby(['Ticker', 'Title'], as_index=False)['Score'].sum()
        # sort by score:
        c_matches_sorted = c_matches_dedup.sort_values(by='Score', ascending=False)
        # convert results into a pretty string:
        return self.df_to_pretty_string(c_matches_sorted)

    def df_to_pretty_string(
            self,
            df: pd.DataFrame) -> str:
        """
        Convert a pd.DataFrame into a pretty string, using the tabulate package.

        Only the ``Title`` and ``Ticker`` columns are rendered, without
        headers or index.

        Args:
            df : pd.DataFrame
                The dataframe to convert

        Returns:
            pretty_string : str
                A string containing the pretty-formatted dataframe
        """
        pretty_string = tabulate(
            df[['Title', 'Ticker']],
            tablefmt='plain',
            showindex=False,
            numalign='left',
            stralign='left')
        return pretty_string
201
+
202
if __name__ == "__main__":
    # Manual smoke run: look up a sample query and print the formatted matches.
    finder = TickerFinder()
    results = finder.find_best_matching_ticker_or_title("microsoft")
    print(results)