File size: 4,454 Bytes
5676a84
 
 
 
 
 
f5ca58f
 
5676a84
 
f5ca58f
5676a84
 
 
 
 
 
 
 
 
 
 
f5ca58f
5676a84
e5ce1f9
 
 
5676a84
e5ce1f9
 
 
 
 
 
 
 
 
 
 
 
 
5676a84
f5ca58f
5676a84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f5ca58f
 
 
 
e5ce1f9
e43b3fc
 
 
 
 
 
 
 
 
 
 
f5ca58f
f3586f7
 
 
 
 
 
 
 
 
 
 
 
5676a84
 
f5ca58f
 
 
 
5676a84
f5ca58f
5676a84
 
f5ca58f
5676a84
 
f5ca58f
 
 
 
 
 
5676a84
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from bs4 import BeautifulSoup
import requests
import pandas as pd
import itertools
import yfinance as yf

import hopsworks

from datetime import datetime, timedelta

## Fetch stock price data from Yahoo Finance
def get_stock_price(ticker, start_date, end_date):
  """Download price history from Yahoo Finance and normalise the frame.

  Args:
    ticker: Symbol passed to ``yf.download``. 'AMZN' (and the legacy
      spelling 'AMAZ') is labelled 'AMAZON', 'META' is labelled 'META';
      every other value is labelled 'APPLE'.
    start_date: Forwarded to ``yf.download`` as ``start``.
    end_date: Forwarded to ``yf.download`` as ``end``.

  Returns:
    DataFrame with the index moved into a column, lower-cased column
    names, 'adj close' renamed to 'adj_close', a leading 'name' column
    holding the company label, and a timezone-naive 'date' column.
  """
  company = 'APPLE'
  # The rest of this file identifies Amazon as 'AMZN'; the original check
  # only matched 'AMAZ', so 'AMZN' requests were labelled 'APPLE'.
  # 'AMAZ' is still accepted so existing callers keep working.
  if ticker in ('AMZN', 'AMAZ'):
    company = 'AMAZON'
  elif ticker == 'META':
    company = 'META'
  stock_df = yf.download(ticker, start=start_date, end=end_date)
  stock_df = stock_df.reset_index(level=0)
  stock_df.columns = stock_df.columns.str.lower()
  stock_df.rename(columns={'adj close': 'adj_close'}, inplace=True)
  stock_df.insert(0, 'name', company)
  # Strip any timezone so the dates are naive datetimes.
  stock_df['date'] = pd.to_datetime(stock_df.date).dt.tz_localize(None)
  return stock_df

## Fetch stock price from hopsworks
def time_2_datetime(x):
  """Convert a timestamp in milliseconds to a naive local ``datetime``."""
  seconds = x / 1000
  return datetime.fromtimestamp(seconds)

def get_stock_price_from_hopsworks(name):
  """Read the 'stocks_fg' feature group and return one row for a company.

  Args:
    name: Company name; it is upper-cased before being matched against
      the 'name' column.

  Returns:
    A one-row DataFrame (empty if nothing matches): the first row after
    sorting the company's rows by 'date' in ascending order.
  """
  project = hopsworks.login()
  fs = project.get_feature_store()
  stock_fg = fs.get_feature_group(name="stocks_fg", version=1)
  stock_df = stock_fg.select_all().read()
  # Take an explicit copy: assigning a column on a .loc slice is chained
  # assignment and triggers pandas' SettingWithCopyWarning.
  stock_df = stock_df.loc[stock_df['name'] == name.upper()].copy()
  # time_2_datetime interprets each value as a millisecond timestamp.
  stock_df['date'] = stock_df['date'].apply(time_2_datetime)
  # NOTE(review): ascending sort + head(1) yields the EARLIEST row. If the
  # latest price is intended this should be tail(1) - confirm with callers.
  return stock_df.sort_values(by='date').head(1)

## Scrape stock news from investing.com
def get_articles_urls(company, startpage, endpage):
  """Collect article URLs from investing.com news listing pages.

  Args:
    company: Slug inserted into the listing URL
      (``/equities/{company}-inc-news/{page}``).
    startpage: First listing page number (inclusive).
    endpage: Last listing page number (exclusive).

  Returns:
    List of URLs built as 'https://www.investing.com/' + href, with
    entries starting with 'https://www.investing.com//pro/offers'
    removed.
  """
  urls = []
  for page_number in range(startpage, endpage):
    if page_number % 100 == 0:
      print(page_number)  # progress indicator
    url = f"https://www.investing.com/equities/{company}-inc-news/{page_number}"
    # Use a separate name for the response; the original rebound the loop
    # variable `page` here.
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    containers = soup.find_all('div', attrs={'class': 'mediumTitle1'})
    if len(containers) < 2:
      # The article list is taken from the second matching container.
      # Report and skip the page instead of raising IndexError and losing
      # everything collected so far.
      print(url)
      continue
    for article in containers[1].find_all('article'):
      link = article.find('a')
      href = link.get('href') if link is not None else None
      if href is None:
        continue
      # The prefix is deliberately kept as-is: the filter below matches the
      # resulting '//pro/offers' form.
      urls.append('https://www.investing.com/' + href)
  return [u for u in urls if not u.startswith('https://www.investing.com//pro/offers')]

def scrape_news(urls, df, company):
  """Scrape articles and return ``df`` with one extra row per article.

  Args:
    urls: Iterable of article URLs to fetch.
    df: DataFrame the scraped rows are appended to (not modified).
    company: Value stored in the 'ticker' column of every new row.

  Returns:
    ``df`` itself when nothing was scraped; otherwise a new DataFrame
    made of ``df`` followed by rows with the columns 'ticker',
    'publish_date', 'title', 'body_text' and 'url'. Each new row keeps
    index label 0, as the original append-based code did.
  """
  scraped = []
  for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    header = soup.find('h1', attrs={'class': 'articleHeader'})
    details = soup.find('div', attrs={'class': 'contentSectionDetails'})
    body = soup.find('div', attrs={'class': 'WYSIWYG articlePage'})
    date_span = details.find("span") if details is not None else None
    if header is None or date_span is None or body is None:
      # Page does not have the expected article layout: report and skip
      # it rather than failing with AttributeError on a None node.
      print(url)
      continue
    title = header.text.strip()
    publish_date = date_span.text.strip()
    paragraphs = ' '.join(p.get_text() for p in body.find_all("p"))
    article = paragraphs.replace('Position added successfully to:', '').strip()
    record = {'ticker': company, 'publish_date': publish_date, 'title': title, 'body_text': article, 'url': url}
    scraped.append(pd.DataFrame(record, index=[0]))
  if not scraped:
    return df
  # DataFrame.append was removed in pandas 2.0; concatenating once also
  # avoids copying the growing frame on every iteration.
  return pd.concat([df, *scraped])

## Fetch stock news from hopsworks
def get_news_from_hopsworks():
  """Return every feature of the 'market_news_fg_for_three' feature group.

  Logs in to Hopsworks, opens the project's feature store, and reads the
  result of selecting all features of version 1 of the feature group.
  """
  feature_store = hopsworks.login().get_feature_store()
  news_group = feature_store.get_feature_group(name="market_news_fg_for_three", version=1)
  return news_group.select_all().read()

## Fetch history prediction plot
def get_history_plot_from_hopsworks(ticker):
  """Download the stock-prediction image for ``ticker`` via the dataset API.

  'AAPL' selects the Apple image, 'AMZN' the Amazon image, and any other
  value the Meta image. The download is requested with ``overwrite=True``.
  Returns None.
  """
  datasets = hopsworks.login().get_dataset_api()
  if ticker == 'AAPL':
    remote_image = "Resources/images/apple_stock_prediction.png"
  elif ticker == 'AMZN':
    remote_image = "Resources/images/amazon_stock_prediction.png"
  else:
    remote_image = "Resources/images/meta_stock_prediction.png"
  datasets.download(remote_image, overwrite=True)

## Formalize the date column
def remove_parentheses(s):
  """Return the text inside the first '(...)' group of ``s``.

  Args:
    s: Input string.

  Returns:
    The substring between the first '(' and the next ')' after it. If
    ``s`` has no '(' or no ')' following it, ``s`` is returned unchanged.
  """
  start = s.find("(")
  if start == -1:
    return s
  # Look for the closing parenthesis only AFTER the opening one. The
  # original searched from the start of the string, so a missing ')'
  # (find() == -1) silently chopped off the last character and a ')'
  # placed before the '(' produced an empty string.
  end = s.find(")", start + 1)
  if end == -1:
    return s
  return s[start + 1:end]
def change_date_format(df):
  """Parse an object-dtype 'publish_date' column into datetimes, in place.

  When the column's dtype is not ``object`` the frame is returned
  untouched. Otherwise every value is passed through
  ``remove_parentheses`` and then parsed with the format
  '%b %d, %Y %I:%M%p ET'. The same ``df`` object is returned.
  """
  if df['publish_date'].dtype != object:
    return df
  df['publish_date'] = df['publish_date'].apply(remove_parentheses)
  df['publish_date'] = pd.to_datetime(df['publish_date'], format='%b %d, %Y %I:%M%p ET')
  return df

def select_oneday_news(df, day):
  """Return the rows of ``df`` whose 'publish_date' falls on ``day``.

  Args:
    df: News DataFrame with a 'publish_date' column. It is not modified.
    day: Object with a ``.date()`` method (e.g. ``datetime``); only its
      calendar date is compared.

  Returns:
    A new DataFrame holding the matching rows with ``df``'s columns.
  """
  # Parse the dates on a copy so the caller's 'publish_date' stays as is.
  parsed = change_date_format(df.copy())['publish_date']
  # Build the filter as a standalone mask. The original wrote a temporary
  # 'date' column into the caller's frame, leaving it there after the call
  # and clobbering any existing 'date' column.
  on_day = (parsed.dt.date == day.date()).to_numpy()
  return df.loc[on_day]