Spaces:
Sleeping
Sleeping
| import os | |
| from typing import Annotated | |
| from pandas import DataFrame | |
| from finnlp.data_sources.news.cnbc_streaming import CNBC_Streaming | |
| from finnlp.data_sources.news.yicai_streaming import Yicai_Streaming | |
| from finnlp.data_sources.news.investorplace_streaming import InvestorPlace_Streaming | |
| # from finnlp.data_sources.news.eastmoney_streaming import Eastmoney_Streaming | |
| from finnlp.data_sources.social_media.xueqiu_streaming import Xueqiu_Streaming | |
| from finnlp.data_sources.social_media.stocktwits_streaming import Stocktwits_Streaming | |
| # from finnlp.data_sources.social_media.reddit_streaming import Reddit_Streaming | |
| from finnlp.data_sources.news.sina_finance_date_range import Sina_Finance_Date_Range | |
| from finnlp.data_sources.news.finnhub_date_range import Finnhub_Date_Range | |
| from ..utils import save_output, SavePathType | |
# Proxy presets that can be passed as the `config` dict to a FinNLP
# downloader. NOTE(review): neither constant is referenced anywhere in
# this file — presumably callers (or downstream code) select one of
# these; verify before removing.
US_Proxy = {
    "use_proxy": "us_free",      # proxy source name understood by finnlp — TODO confirm semantics
    "max_retry": 5,              # retries per request
    "proxy_pages": 5,            # pages of proxies to fetch
}
CN_Proxy = {
    "use_proxy": "china_free",   # proxy source name understood by finnlp — TODO confirm semantics
    "max_retry": 5,
    "proxy_pages": 5,
}
def streaming_download(streaming, config, tag, keyword, rounds, selected_columns, save_path):
    """Run a FinNLP streaming downloader and return the selected columns.

    Instantiates ``streaming`` with ``config``, then dispatches to
    whichever entry point the downloader class exposes — keyword search,
    per-stock stream, or the full stream — in that order of preference.
    The result is trimmed to ``selected_columns``, persisted via
    ``save_output`` (a no-op path handling is up to that helper), and
    returned as a DataFrame.
    """
    worker = streaming(config)

    # Dispatch on whichever download method this downloader implements.
    if hasattr(worker, "download_streaming_search"):
        worker.download_streaming_search(keyword, rounds)
    elif hasattr(worker, "download_streaming_stock"):
        worker.download_streaming_stock(keyword, rounds)
    else:
        worker.download_streaming_all(rounds)

    trimmed = worker.dataframe[selected_columns]
    save_output(trimmed, tag, save_path)
    return trimmed
def date_range_download(date_range, config, tag, start_date, end_date, stock, selected_columns, save_path):
    """Run a FinNLP date-range downloader and return the selected columns.

    Instantiates ``date_range`` with ``config``; prefers the per-stock
    entry point when available, otherwise downloads everything in the
    range. If the downloader supports fetching full article bodies
    (``gather_content``), that step runs before column selection. The
    trimmed DataFrame is persisted via ``save_output`` and returned.
    """
    worker = date_range(config)

    if hasattr(worker, "download_date_range_stock"):
        worker.download_date_range_stock(start_date, end_date, stock)
    else:
        worker.download_date_range_all(start_date, end_date)

    # Some sources only return headlines until content is gathered.
    if hasattr(worker, "gather_content"):
        worker.gather_content()

    trimmed = worker.dataframe[selected_columns]
    save_output(trimmed, tag, save_path)
    return trimmed
class FinNLPUtils:
    """Namespace of thin wrappers around the FinNLP downloaders.

    Fix: the original methods were defined without ``self`` and without
    ``@staticmethod``, so while ``FinNLPUtils.method(...)`` worked,
    calling any method on an *instance* would silently pass the instance
    as the first positional argument (``keyword``/``start_date``).
    Every method is now a ``@staticmethod``; class-level calls are
    unchanged.
    """

    # ------------------------------------------------------------------
    # Streaming news download
    # ------------------------------------------------------------------

    @staticmethod
    def cnbc_news_download(
        keyword: Annotated[str, "Keyword to search in news stream"],
        rounds: Annotated[int, "Number of rounds to search. Default to 1"] = 1,
        selected_columns: Annotated[list[str], "List of column names of news to return, should be chosen from 'description', 'cn:lastPubDate', 'dateModified', 'cn:dateline', 'cn:branding', 'section', 'cn:type', 'author', 'cn:source', 'cn:subtype', 'duration', 'summary', 'expires', 'cn:sectionSubType', 'cn:contentClassification', 'pubdateunix', 'url', 'datePublished', 'cn:promoImage', 'cn:title', 'cn:keyword', 'cn:liveURL', 'brand', 'hint', 'hint_detail'. Default to ['author', 'datePublished', 'description' ,'section', 'cn:title', 'summary']"] = ["author", "datePublished", "description" ,"section", "cn:title", "summary"],
        save_path: SavePathType = None
    ) -> DataFrame:
        """Search the CNBC news stream for ``keyword`` and return the selected columns."""
        return streaming_download(CNBC_Streaming, {}, "CNBC News", keyword, rounds, selected_columns, save_path)

    @staticmethod
    def yicai_news_download(
        keyword: Annotated[str, "Keyword to search in news stream"],
        rounds: Annotated[int, "Number of rounds to search. Default to 1"] = 1,
        selected_columns: Annotated[list[str], "List of column names of news to return, should be chosen from 'author','channelid','creationDate','desc','id','previewImage','source','tags','title','topics','typeo','url','weight'. Default to ['author', 'creationDate', 'desc' ,'source', 'title']"] = ["author", "creationDate", "desc" ,"source", "title"],
        save_path: SavePathType = None
    ) -> DataFrame:
        """Search the Yicai (第一财经) news stream for ``keyword``."""
        return streaming_download(Yicai_Streaming, {}, "Yicai News", keyword, rounds, selected_columns, save_path)

    @staticmethod
    def investor_place_news_download(
        keyword: Annotated[str, "Keyword to search in news stream"],
        rounds: Annotated[int, "Number of rounds to search. Default to 1"] = 1,
        selected_columns: Annotated[list[str], "List of column names of news to return, should be chosen from 'title', 'time', 'author', 'summary'. Default to ['title', 'time', 'author', 'summary']"] = ['title', 'time', 'author', 'summary'],
        save_path: SavePathType = None
    ) -> DataFrame:
        """Search the InvestorPlace news stream for ``keyword``."""
        return streaming_download(InvestorPlace_Streaming, {}, "Investor Place News", keyword, rounds, selected_columns, save_path)

    # def eastmoney_news_download(
    #     stock: Annotated[str, "stock code, e.g. 600519"],
    #     pages: Annotated[int, "Number of pages to retrieve. Default to 1"] = 1,
    #     selected_columns: Annotated[list[str], "List of column names of news to return, should be chosen from 'title', 'time', 'author', 'summary'. Default to ['title', 'time', 'author', 'summary']"] = ['title', 'time', 'author', 'summary'],
    #     verbose: Annotated[bool, "Whether to print downloaded news to console. Default to True"] = True,
    #     save_path: Annotated[str, "If specified (recommended if the amount of news is large), the downloaded news will be saved to save_path, otherwise the news will be returned as a string. Default to None"] = None,
    # ) -> str:
    #     return streaming_download(Eastmoney_Streaming, "Eastmoney", stock, pages, selected_columns, save_path)

    # ------------------------------------------------------------------
    # Date-range news download
    # ------------------------------------------------------------------

    @staticmethod
    def sina_finance_news_download(
        start_date: Annotated[str, "Start date of the news to retrieve, YYYY-mm-dd"],
        end_date: Annotated[str, "End date of the news to retrieve, YYYY-mm-dd"],
        selected_columns: Annotated[list[str], """
            List of column names of news to return, should be chosen from
            'mediaid', 'productid', 'summary', 'ctime', 'url', 'author', 'stitle',
            'authoruid', 'wapsummary', 'images', 'level', 'keywords', 'mlids',
            'wapurl', 'columnid', 'oid', 'img', 'subjectid', 'commentid',
            'ipad_vid', 'vid', 'video_id', 'channelid', 'intime',
            'video_time_length', 'categoryid', 'hqChart', 'intro', 'is_cre_manual',
            'icons', 'mtime', 'media_name', 'title', 'docid', 'urls', 'templateid',
            'lids', 'wapurls', 'ext', 'comment_reply', 'comment_show', 'comment_total', 'praise',
            'dispraise', 'important', 'content'. Default to ['title', 'author', 'content']
            """
        ] = ['title', 'author', 'content'],
        save_path: SavePathType = None
    ) -> DataFrame:
        """Download Sina Finance news for the given date range (no stock filter)."""
        return date_range_download(Sina_Finance_Date_Range, {}, "Sina Finance News", start_date, end_date, None, selected_columns, save_path)

    @staticmethod
    def finnhub_news_download(
        start_date: Annotated[str, "Start date of the news to retrieve, YYYY-mm-dd"],
        end_date: Annotated[str, "End date of the news to retrieve, YYYY-mm-dd"],
        stock: Annotated[str, "Stock symbol, e.g. AAPL"],
        selected_columns: Annotated[list[str], "List of column names of news to return, should be chosen from 'category', 'datetime', 'headline', 'id', 'image', 'related', 'source', 'summary', 'url', 'content'. Default to ['headline', 'datetime', 'source', 'summary']"] = ['headline', 'datetime', 'source', 'summary'],
        save_path: SavePathType = None
    ) -> DataFrame:
        """Download Finnhub company news for ``stock`` over the date range.

        Raises ``KeyError`` if the ``FINNHUB_API_KEY`` environment variable
        is not set.
        """
        return date_range_download(Finnhub_Date_Range, {"token": os.environ['FINNHUB_API_KEY']}, "Finnhub News", start_date, end_date, stock, selected_columns, save_path)

    # ------------------------------------------------------------------
    # Social media
    # ------------------------------------------------------------------

    @staticmethod
    def xueqiu_social_media_download(
        stock: Annotated[str, "Stock symbol, e.g. 'AAPL'"],
        rounds: Annotated[int, "Number of rounds to search. Default to 1"] = 1,
        selected_columns: Annotated[list[str], """
            List of column names of news to return, should be chosen from blocked',
            'blocking', 'canEdit', 'commentId', 'controversial',
            'created_at', 'description', 'donate_count', 'donate_snowcoin',
            'editable', 'expend', 'fav_count', 'favorited', 'flags', 'flagsObj',
            'hot', 'id', 'is_answer', 'is_bonus', 'is_refused', 'is_reward',
            'is_ss_multi_pic', 'legal_user_visible', 'like_count', 'liked', 'mark',
            'pic', 'promotion_id', 'reply_count', 'retweet_count',
            'retweet_status_id', 'reward_count', 'reward_user_count', 'rqid',
            'source', 'source_feed', 'source_link', 'target', 'text', 'timeBefore',
            'title', 'trackJson', 'truncated', 'truncated_by', 'type', 'user',
            'user_id', 'view_count', 'firstImg', 'pic_sizes', 'edited_at'.
            Default to ['created_at', 'description', 'title', 'text', 'target', 'source']
            """] = ['created_at', 'description', 'title', 'text', 'target', 'source'],
        save_path: SavePathType = None
    ) -> DataFrame:
        """Download Xueqiu (雪球) posts mentioning ``stock``."""
        return streaming_download(Xueqiu_Streaming, {}, "Xueqiu Social Media", stock, rounds, selected_columns, save_path)

    @staticmethod
    def stocktwits_social_media_download(
        stock: Annotated[str, "Stock symbol, e.g. 'AAPL'"],
        rounds: Annotated[int, "Number of rounds to search. Default to 1"] = 1,
        selected_columns: Annotated[list[str], """
            List of column names of news to return, should be chosen from 'id',
            'body', 'created_at', 'user', 'source', 'symbols', 'prices',
            'mentioned_users', 'entities', 'liked_by_self', 'reshared_by_self',
            'conversation', 'links', 'likes', 'reshare_message', 'structurable',
            'reshares'. Default to ['created_at', 'body']
            """] = ['created_at', 'body'],
        save_path: SavePathType = None
    ) -> DataFrame:
        """Download Stocktwits messages for ``stock``."""
        return streaming_download(Stocktwits_Streaming, {}, "Stocktwits Social Media", stock, rounds, selected_columns, save_path)

    # def reddit_social_media_download(
    #     pages: Annotated[int, "Number of pages to retrieve. Default to 1"] = 1,
    #     selected_columns: Annotated[list[str], """
    #         List of column names of news to return, should be chosen from 'id',
    #         'body', 'created_at', 'user', 'source', 'symbols', 'prices',
    #         'mentioned_users', 'entities', 'liked_by_self', 'reshared_by_self',
    #         'conversation', 'links', 'likes', 'reshare_message', 'structurable',
    #         'reshares'. Default to ['created_at', 'body']
    #         """] = ['created_at', 'body'],
    #     verbose: Annotated[bool, "Whether to print downloaded news to console. Default to True"] = True,
    #     save_path: Annotated[str, "If specified (recommended if the amount of news is large), the downloaded news will be saved to save_path. Default to None"] = None,
    # ) -> DataFrame:
    #     return streaming_download(Reddit_Streaming, {}, "Reddit Social Media", None, pages, selected_columns, save_path)

    # ------------------------------------------------------------------
    # Company announcements (not working well)
    # ------------------------------------------------------------------

    # from finnlp.data_sources.company_announcement.sec import SEC_Announcement
    # from finnlp.data_sources.company_announcement.juchao import Juchao_Announcement

    # def sec_announcement_download(
    #     start_date: Annotated[str, "Start date of the news to retrieve, YYYY-mm-dd"],
    #     end_date: Annotated[str, "End date of the news to retrieve, YYYY-mm-dd"],
    #     stock: Annotated[str, "Stock symbol, e.g. AAPL"],
    #     selected_columns: Annotated[list[str], "List of column names of news to return, should be chosen from 'category', 'datetime', 'headline', 'id', 'image', 'related', 'source', 'summary', 'url', 'content'. Default to ['headline', 'datetime', 'source', 'summary']"] = ['headline', 'datetime', 'source', 'summary'],
    #     verbose: Annotated[bool, "Whether to print downloaded news to console. Default to True"] = True,
    #     save_path: Annotated[str, "If specified (recommended if the amount of news is large), the downloaded news will be saved to save_path. Default to None"] = None,
    # ) -> DataFrame:
    #     return date_range_download(SEC_Announcement, {}, "SEC Announcements", start_date, end_date, stock, selected_columns, save_path)

    # def juchao_announcement_download(
    #     start_date: Annotated[str, "Start date of the news to retrieve, YYYY-mm-dd"],
    #     end_date: Annotated[str, "End date of the news to retrieve, YYYY-mm-dd"],
    #     stock: Annotated[str, "Stock code, e.g. 000001"],
    #     selected_columns: Annotated[list[str], "List of column names of news to return, should be chosen from 'category', 'datetime', 'headline', 'id', 'image', 'related', 'source', 'summary', 'url', 'content'. Default to ['headline', 'datetime', 'source', 'summary']"] = ['headline', 'datetime', 'source', 'summary'],
    #     verbose: Annotated[bool, "Whether to print downloaded news to console. Default to True"] = True,
    #     save_path: Annotated[str, "If specified (recommended if the amount of news is large), the downloaded news will be saved to save_path. Default to None"] = None,
    # ) -> DataFrame:
    #     return date_range_download(Juchao_Announcement, {}, "Juchao Announcements", start_date, end_date, stock, selected_columns, save_path)
if __name__ == "__main__":
    # Smoke-test entry point: each call performs a live network download,
    # so all but one example are left commented out.
    result = FinNLPUtils.yicai_news_download("茅台", save_path="yicai_maotai.csv")
    print(result)
    # print(cnbc_news_download("tesla", save_path="cnbc_tesla.csv"))
    # investor_place_news_download("tesla", save_path="invpl_tesla.csv")
    # eastmoney_news_download("600519", save_path="estmny_maotai.csv")
    # sina_finance_news_download("2024-03-02", "2024-03-02", save_path="sina_news.csv")
    # finnhub_news_download("2024-03-02", "2024-03-02", "AAPL", save_path="finnhub_aapl_news.csv")
    # stocktwits_social_media_download("AAPL", save_path="stocktwits_aapl.csv")
    # xueqiu_social_media_download("茅台", save_path="xueqiu_maotai.csv")
    # reddit_social_media_download(save_path="reddit_social_media.csv")
    # juchao_announcement_download("000001", "2020-01-01", "2020-06-01", save_path="sec_announcement.csv")