Stock_Prediction / data_loader_functions.py
Yilin98's picture
update to adapt new version
e5ce1f9
from bs4 import BeautifulSoup
import requests
import pandas as pd
import itertools
import yfinance as yf
import hopsworks
from datetime import datetime, timedelta
## Fetch stock price data from Yahoo Finance
def get_stock_price(ticker, start_date, end_date):
    """Download price history for *ticker* from Yahoo Finance.

    Args:
        ticker: Stock symbol passed to ``yf.download``. ``'AMZN'`` (and the
            legacy spelling ``'AMAZ'``) is labelled ``'AMAZON'``, ``'META'``
            is labelled ``'META'``; any other symbol is labelled ``'APPLE'``.
        start_date: Start of the range, forwarded as ``start=``.
        end_date: End of the range, forwarded as ``end=``.

    Returns:
        A DataFrame with lower-cased column names, ``adj close`` renamed to
        ``adj_close``, a leading ``name`` column holding the company label,
        and a timezone-naive ``date`` column.
    """
    # The original only matched 'AMAZ', while the rest of this module uses
    # the symbol 'AMZN'; accept both so Amazon rows are no longer labelled
    # 'APPLE'.
    company_by_ticker = {'AMZN': 'AMAZON', 'AMAZ': 'AMAZON', 'META': 'META'}
    company = company_by_ticker.get(ticker, 'APPLE')

    stock_df = yf.download(ticker, start=start_date, end=end_date)
    # Recent yfinance releases may return (field, ticker) MultiIndex columns
    # even for a single symbol; keep only the field level so the string
    # operations below work.
    if isinstance(stock_df.columns, pd.MultiIndex):
        stock_df.columns = stock_df.columns.get_level_values(0)

    stock_df = stock_df.reset_index(level=0)
    stock_df.columns = stock_df.columns.str.lower()
    stock_df.rename(columns={'adj close': 'adj_close'}, inplace=True)
    stock_df.insert(0, 'name', company)
    # Drop any timezone information so dates compare as naive timestamps.
    stock_df['date'] = pd.to_datetime(stock_df['date']).dt.tz_localize(None)
    return stock_df
## Fetch stock price from hopsworks
def time_2_datetime(x):
    """Convert a millisecond epoch timestamp into a local, naive ``datetime``."""
    # ``fromtimestamp`` expects seconds, so scale the millisecond value down.
    return datetime.fromtimestamp(x / 1000)
def get_stock_price_from_hopsworks(name):
    """Read one stock-price row for a company from the Hopsworks feature store.

    Logs in to Hopsworks, reads every row of the ``stocks_fg`` feature group
    (version 1), keeps the rows whose ``name`` equals ``name.upper()``,
    converts the ``date`` column with ``time_2_datetime`` and sorts by it.

    Args:
        name: Company name; matched case-insensitively by upper-casing it.

    Returns:
        A one-row DataFrame (empty if nothing matches): the first row after
        sorting by ``date`` in ascending order.
    """
    project = hopsworks.login()
    fs = project.get_feature_store()
    stock_fg = fs.get_feature_group(name="stocks_fg", version=1)
    all_prices = stock_fg.select_all().read()

    # Take an explicit copy of the filtered rows: assigning a column onto a
    # ``.loc`` slice otherwise triggers pandas' SettingWithCopyWarning.
    stock_df = all_prices.loc[all_prices['name'] == name.upper()].copy()
    stock_df['date'] = stock_df['date'].apply(time_2_datetime)
    stock_df = stock_df.sort_values(by='date')
    # NOTE(review): with an ascending sort, head(1) is the EARLIEST row.
    # If the most recent price is wanted this should be tail(1) - confirm
    # against the caller before changing.
    return stock_df.head(1)
## Scrape stock news from investing.com
def get_articles_urls(company, startpage, endpage):
    """Collect article links from a company's news listing on investing.com.

    Args:
        company: Company slug inserted into the ``{company}-inc-news`` URL.
        startpage: First listing page to fetch (inclusive).
        endpage: Listing page to stop at (exclusive).

    Returns:
        A list of article URLs, excluding links that start with
        ``https://www.investing.com//pro/offers``.
    """
    site_root = 'https://www.investing.com/'
    promo_prefix = 'https://www.investing.com//pro/offers'

    collected = []
    for page_no in range(startpage, endpage):
        # Lightweight progress indicator for long crawls.
        if page_no % 100 == 0:
            print(page_no)
        response = requests.get(f"https://www.investing.com/equities/{company}-inc-news/{page_no}")
        soup = BeautifulSoup(response.text, 'html.parser')
        # The article list is taken from the second 'mediumTitle1' container.
        listing = soup.find_all('div', attrs={'class': 'mediumTitle1'})[1]
        collected.extend(
            site_root + article.find('a')['href']
            for article in listing.find_all('article')
        )

    # hrefs are appended to a root that already ends in '/', which is why
    # the promo prefix contains a double slash.
    return [link for link in collected if not link.startswith(promo_prefix)]
def scrape_news(urls, df, company):
    """Download each article in *urls* and append it as a row to *df*.

    Pages without an ``h1.articleHeader`` element are skipped and their URL
    is printed.

    Args:
        urls: Iterable of article URLs to fetch.
        df: DataFrame to extend; it is not modified in place.
        company: Value stored in the ``ticker`` column of every new row.

    Returns:
        *df* followed by one row per scraped article, with the columns
        ``ticker``, ``publish_date``, ``title``, ``body_text`` and ``url``.
        Each new row keeps index label 0, as before. When no article is
        scraped, *df* itself is returned.
    """
    frames = [df]
    for url in urls:
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')

        header = soup.find('h1', attrs={'class': 'articleHeader'})
        if header is None:
            # Not an article page (or the layout changed): report and skip.
            print(url)
            continue

        title = header.text.strip()
        publish_date = (
            soup.find('div', attrs={'class': 'contentSectionDetails'})
            .find("span")
            .text.strip()
        )
        paragraphs = soup.find('div', attrs={'class': 'WYSIWYG articlePage'}).find_all("p")
        body_text = (
            ' '.join(p.get_text() for p in paragraphs)
            .replace('Position added successfully to:', '')
            .strip()
        )

        row = {
            'ticker': company,
            'publish_date': publish_date,
            'title': title,
            'body_text': body_text,
            'url': url,
        }
        frames.append(pd.DataFrame(row, index=[0]))

    if len(frames) == 1:
        return df
    # DataFrame.append was removed in pandas 2.0; a single concat is also
    # linear instead of re-copying the frame for every article.
    return pd.concat(frames)
## Fetch stock news from hopsworks
def get_news_from_hopsworks():
    """Read all rows of the ``market_news_fg_for_three`` feature group.

    Logs in to Hopsworks, opens the project's feature store and returns the
    result of reading every feature of version 1 of that feature group.
    """
    # NOTE: an earlier draft went through a "market_news" feature view built
    # on "market_news_fg"; the feature group is now read directly.
    feature_store = hopsworks.login().get_feature_store()
    news_group = feature_store.get_feature_group(
        name="market_news_fg_for_three", version=1
    )
    return news_group.select_all().read()
## Fetch history prediction plot
def get_history_plot_from_hopsworks(ticker):
    """Download the stored prediction plot for *ticker* from Hopsworks.

    ``'AAPL'`` selects the Apple image and ``'AMZN'`` the Amazon image; every
    other value selects the Meta image. An existing local copy is
    overwritten. Returns ``None``.
    """
    dataset_api = hopsworks.login().get_dataset_api()

    image_by_ticker = {
        'AAPL': "Resources/images/apple_stock_prediction.png",
        'AMZN': "Resources/images/amazon_stock_prediction.png",
    }
    # Anything that is not Apple or Amazon falls back to the Meta plot.
    remote_path = image_by_ticker.get(
        ticker, "Resources/images/meta_stock_prediction.png"
    )
    dataset_api.download(remote_path, overwrite=True)
## Formalize the date column
def remove_parentheses(s):
    """Return the text inside the first ``(...)`` pair of *s*.

    Args:
        s: Input string.

    Returns:
        The substring between the first ``(`` and the first ``)`` that
        follows it. If *s* has no ``(``, or no ``)`` after it, *s* is
        returned unchanged.
    """
    start = s.find("(")
    if start == -1:
        return s
    # Search for the closing parenthesis only AFTER the opening one. A
    # missing ')' used to give find() == -1, which silently chopped the last
    # character, and a ')' before the '(' produced an empty string.
    end = s.find(")", start + 1)
    if end == -1:
        return s
    return s[start + 1:end]
def change_date_format(df):
    """Parse the ``publish_date`` column of *df* into datetimes, in place.

    Only acts when the column has ``object`` dtype: each value is passed
    through ``remove_parentheses`` and then parsed with the format
    ``'%b %d, %Y %I:%M%p ET'``. Any other dtype is left untouched.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Guard clause: nothing to do unless the column is still raw objects.
    if df['publish_date'].dtype != object:
        return df

    df['publish_date'] = df['publish_date'].apply(remove_parentheses)
    df['publish_date'] = pd.to_datetime(
        df['publish_date'], format='%b %d, %Y %I:%M%p ET'
    )
    return df
def select_oneday_news(df, day):
    """Return the rows of *df* whose ``publish_date`` falls on *day*.

    Args:
        df: News DataFrame with a ``publish_date`` column. It is not
            modified; the dates are parsed on a copy.
        day: Object with a ``date()`` method (e.g. a ``datetime``) giving
            the calendar day to keep.

    Returns:
        The matching rows of *df* with ``publish_date`` in its original
        form and without any ``date`` column.
    """
    # Parse on a copy so the caller's publish_date values stay as they were.
    parsed = change_date_format(df.copy())['publish_date']
    on_day = parsed.apply(lambda ts: ts.date()) == day.date()

    # Filter with a positional boolean mask rather than a temporary 'date'
    # column: the original left that column behind on the caller's frame.
    # The mask is positional so duplicate index labels cannot mis-align.
    selected = df.loc[on_day.to_numpy()]
    # The original result never had a 'date' column; keep that shape even
    # if the input already carries one.
    return selected.drop(columns='date', errors='ignore')