Spaces:

OnursFriends
/

StockAnalysisAgent

Sleeping

File size: 8,435 Bytes

import json
import logging
import os
import pandas as pd
from rapidfuzz import process, fuzz
import requests
from tabulate import tabulate


class TickerFinder():
    """
    A class for finding the best matching ticker for a given company name or ticker.
    Uses data from https://www.sec.gov/files/company_tickers.json and rapidfuzz package for fuzzy matching.

    The raw SEC file is cached locally and reduced to a "compact" file that only
    holds the ticker and title columns; the compact file is what gets loaded
    into ``self.df``.
    """

    # Upper bound (seconds) for the SEC download so a stalled connection
    # cannot block the caller indefinitely.
    DOWNLOAD_TIMEOUT_SECONDS = 30

    def __init__(self):
        """
        Initialize the TickerFinder object.
        This method sets the file paths and reads the ticker data into the self.df attribute.
        """
        self.logger = logging.getLogger(__name__)
        # Data folders live next to the package folder, i.e. two levels up from this file.
        self.rootdir = os.path.dirname(os.path.dirname(__file__))
        self.fname_raw = os.path.join(self.rootdir, 'data_raw', 'sec_gov_company_tickers_test.json')
        self.fname_compact = os.path.join(self.rootdir, 'data', 'sec_gov_company_tickers_compact.json')
        self.df = self.read_ticker_data()
        self.logger.info('Initialized TickerFinder object')

    @staticmethod
    def _ensure_parent_dir(path: str) -> None:
        """Create the directory that will contain ``path`` if it does not exist yet."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def read_ticker_data(
            self
            ) -> pd.DataFrame:
        """
        Read compact ticker data from a local file.
        If the compact file does not exist yet it is created first
        (see compact_ticker_data).
        Returns
        df : pandas DataFrame
            A frame with the columns 'ticker' and 'title'
        """
        # if the compact data is not available, create it
        if not os.path.exists(self.fname_compact):
            self.logger.info(f'Compact ticker data was not found at {self.fname_compact}, creating it')
            self.compact_ticker_data()
        with open(self.fname_compact, 'r', encoding='utf-8') as f:
            data = json.load(f)
        df = pd.DataFrame.from_dict(data, orient='columns')
        self.logger.info(f'Read compact ticker data from {self.fname_compact}')
        return df

    def compact_ticker_data(
            self
            ) -> None:
        """
        Compact the raw ticker data by extracting only the ticker and title fields and
        saving them to a local file.
        If the raw data is not available, this method will download it first.
        The raw file is expected to map integer-like string keys to records that
        have a 'ticker' and a 'title' entry; records are written in ascending
        numeric key order.
        """
        if not os.path.exists(self.fname_raw):
            self.logger.info(f'Raw ticker data was not found at {self.fname_raw}, downloading it')
            self.download_ticker_data()
        # read the raw data
        with open(self.fname_raw, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Sort numerically instead of using the key as a list index, so gaps in
        # the numbering do not raise IndexError and "10" sorts after "2".
        records = [data[key] for key in sorted(data, key=int)]
        data_compact = {
            'ticker': [record['ticker'] for record in records],
            'title': [record['title'] for record in records],
        }
        # save the compact data (the target folder may not exist on a fresh checkout)
        self._ensure_parent_dir(self.fname_compact)
        with open(self.fname_compact, 'w', encoding='utf-8') as f:
            json.dump(data_compact, f)
        self.logger.info(f'Compacted raw ticker data into {self.fname_compact}')

    def download_ticker_data(
            self
        ) -> None:
        """
        Download the raw ticker data from https://www.sec.gov/files/company_tickers.json
        using the requests package. The data is saved to a local file.
        If the download is successful, the raw data is saved as a JSON file.
        If the download fails, an exception is raised.
        Raises:
        Exception
            If the server answers with a status code other than 200.
            Errors raised by requests itself (connection problems, timeout,
            invalid JSON body) propagate unchanged.
        """
        url = "https://www.sec.gov/files/company_tickers.json"
        headers = {
            "User-Agent": "censored_email_address",
            "Accept-Encoding": "gzip, deflate",
            "Host": "www.sec.gov",
            "Connection": "keep-alive"
        }
        response = requests.get(url, headers=headers, timeout=self.DOWNLOAD_TIMEOUT_SECONDS)
        if response.status_code != 200:
            raise Exception(f"Error downloading ticker data from url.\nResponse status code: {response.status_code}")
        # Parse the body directly; no need to round-trip through a DataFrame
        # just to write the same JSON document back to disk.
        data = response.json()
        # save the raw data
        self._ensure_parent_dir(self.fname_raw)
        with open(self.fname_raw, 'w', encoding='utf-8') as f:
            json.dump(data, f)
        self.logger.info(f'Dowloaded raw ticker data into {self.fname_raw}')

    def _matches_to_frame(
            self,
            matches
            ) -> pd.DataFrame:
        """
        Turn rapidfuzz matches into a results table.
        Args:
        matches : list of (choice, score, key) tuples
            The output of rapidfuzz process.extract run on a column of self.df
        Returns:
        results : pd.DataFrame
            A pd.df with the columns 'Title', 'Ticker' and 'Score'
        """
        rows = []
        for _, score, key in matches:
            # For a pandas Series the key reported by rapidfuzz is the index
            # label, so look the row up by label (one lookup per match).
            record = self.df.loc[key]
            rows.append((record["title"], record["ticker"], score))
        return pd.DataFrame(rows, columns=["Title", "Ticker", "Score"])

    def find_best_matching_title(
            self,
            input_name:str,
            top_n:int=5) -> pd.DataFrame:
        """
        Find the best matching company title for a given company name.
        The comparison is case-insensitive.
        Args:
        input_name : str
            The name to search for
        top_n : int, default=5
            The number of top matches to return
        Returns:
        results : pd.DataFrame
            A pd.df containing the matched title, ticker, and fuzzy matching score
        """
        matches = process.extract(
            input_name.lower(),
            self.df["title"].str.lower(),
            scorer=fuzz.ratio,
            limit=top_n)
        return self._matches_to_frame(matches)

    def find_best_matching_ticker(
            self,
            ticker:str,
            top_n:int=5
            ) -> pd.DataFrame:
        """
        Find the best matching company ticker for a given ticker.
        The input is upper-cased before it is compared with the stored tickers.
        Args:
        ticker : str
            The ticker to search for
        top_n : int, default=5
            The number of top matches to return
        Returns:
        results : pd.DataFrame
            A pd.df containing the title, matched ticker, and fuzzy matching score
        """
        matches = process.extract(
            ticker.upper(),
            self.df["ticker"],
            scorer=fuzz.ratio,
            limit=top_n)
        return self._matches_to_frame(matches)

    def find_best_matching_ticker_or_title(
        self,
        user_input: str
        ) -> str:
        """
        Find the best matching company ticker for a given user input, which may be a ticker or a title.
        Args:
        user_input : str
            The user input to search for
        Returns:
        results : str
            A string containing the best matching title and ticker
            (an HTML table, see df_to_pretty_string)
        """
        # user may be trying to write a ticker, in which case find the best matching ticker:
        ticker_matches = self.find_best_matching_ticker(user_input)
        # user may be trying to write a title, in which case find the best matching title:
        title_matches = self.find_best_matching_title(user_input)
        # total matches:
        c_matches = pd.concat([ticker_matches, title_matches])
        # deduplicate: a company found by both searches gets the sum of both scores,
        # which ranks it above companies found by only one search
        c_matches_dedup = c_matches.groupby(['Ticker', 'Title'], as_index=False)['Score'].sum()
        # sort by score:
        c_matches_sorted = c_matches_dedup.sort_values(by='Score', ascending=False)
        # convert results into a pretty string:
        return self.df_to_pretty_string(c_matches_sorted)

    def df_to_pretty_string(
            self,
            df:pd.DataFrame,
            num_rows:int=5
            ) -> str:
        """
        Convert a pd.DataFrame into a pretty string, using the tabulate package.
        Args:
        df : pd.DataFrame
            The dataframe to convert; must contain the columns 'Title' and 'Ticker'
        num_rows : int, default=5
            The maximum number of rows (taken from the top) to include
        Returns:
        pretty_string : str
            A string containing the dataframe formatted as an HTML table
        """
        df = df.rename(columns={'Title': 'Company Name', 'Ticker': 'Ticker Symbol'})
        df_subset = df[['Company Name', 'Ticker Symbol']].iloc[0:num_rows]
        pretty_table = tabulate(
            df_subset,
            headers='keys',
            tablefmt='html',
            showindex=False,
            numalign='left',
            stralign='left')
        return pretty_table

    def does_ticker_exist(
            self,
            ticker: str
            ) -> bool:
        """
        Check whether a given ticker exists in the ticker data.
        The comparison is an exact, case-sensitive match against the 'ticker' column.
        Args:
        ticker : str
            The ticker to check
        Returns:
        exists : bool
            True if the ticker exists, False otherwise
        """
        return ticker in self.df['ticker'].values


# Example usage:
# if __name__ == "__main__":
#     results = TickerFinder().find_best_matching_ticker_or_title("microsoft")
#     print(results)
#     exists = TickerFinder().does_ticker_exist('bbbbb')
#     print(f'Ticker exists') if exists else print(f'Ticker does not exist')