# Stock_Prediction / sentiment_analysis.py
# Linear Algebra and DataFrames
import numpy as np
import pandas as pd
# Visualization libraries
import seaborn as sns
sns.set_style("whitegrid")
# NLP Preprocessing and Basic tools
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
from emoji import demojize
from data_loader_functions import *
## Crawl down the news from investing.com
def news_scraping(company):
    # Scrape article URLs; 'apple-computer' is the default slug when the
    # company is neither Amazon nor Meta
    urls = get_articles_urls('apple-computer', 1, 3)
    if company == 'Amazon':
        urls = get_articles_urls('amazon-com', 1, 3)
    elif company == 'Meta':
        urls = get_articles_urls('facebook', 1, 3)
    articles_df = pd.DataFrame({'ticker': [],
                                'publish_date': [],
                                'title': [],
                                'body_text': [],
                                'url': []})
    articles_df = scrape_news(urls, articles_df, company)
    # Drop articles whose body text duplicates another article's
    articles_df.drop_duplicates(subset='body_text', inplace=True)
    return articles_df
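# Usage sketch (illustrative only, not called by the pipeline): news_scraping
# needs network access to investing.com via get_articles_urls/scrape_news from
# data_loader_functions; 'Apple' is a hypothetical input.
def _demo_news_scraping():
    articles = news_scraping('Apple')
    print(articles[['ticker', 'publish_date', 'title']].head())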
## Fetch news from hopsworks
def fetching_news(company):
    articles_df = get_news_from_hopsworks()
    # Filtering by ticker is done by the caller (sentiment_analysis);
    # here we only parse the publish dates
    articles_df['publish_date'] = articles_df['publish_date'].apply(time_2_datetime)
    return articles_df
## NLP Processes
# Remove urls
def remove_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)
# Remove the source attribution before the first "-" and any @mentions
def remove_usernames_ressource(text):
    text_split = text.split("-", 1)
    if len(text_split) > 1:
        text = text_split[1]
    text = re.sub(r'@[A-Za-z0-9_]+', ' ', text)
    return text
# Remove hashtags
def remove_hashtags(text):
    return re.sub(r"#[A-Za-z0-9_]+", " ", text)
# Remove punctuation
def remove_punctuation(text, punc_list):
    return text.translate(str.maketrans('', '', punc_list))
# Convert emojis to texts (demojize turns an emoji into ":name:")
def convert_emojis(text):
    return demojize(text).replace(":", "")
# Apply the previous functions
def full_preprocessing(text):
    """
    @param text (str): a string to be processed.
    @return text (str): the processed string.
    """
    punc_list = string.punctuation
    # Replace '&' with 'and'
    text = re.sub(r'&', 'and', text)
    # Collapse runs of whitespace and strip leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()
    text = remove_urls(text)
    text = remove_usernames_ressource(text)
    text = remove_hashtags(text)
    text = remove_punctuation(text, punc_list)
    text = convert_emojis(text)
    # Remove non-ascii characters last, after emojis have been demojized
    # (removing them earlier would strip the emojis before conversion)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text.lower()
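# Minimal sketch of full_preprocessing on a hypothetical raw headline: the
# source prefix before the first "-" is dropped, the url/mention/hashtag are
# removed, "&" becomes "and", the emoji is demojized, and the result is
# lower-cased.
def _demo_full_preprocessing():
    sample = "Investing.com - Apple & Microsoft rally 🚀 https://example.com @trader #AAPL"
    print(full_preprocessing(sample))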
# Transform text into tokens (separated words)
def tokenize(text):
    return text.split()
# Remove stopwords, keeping the negations that carry sentiment
def remove_stopwords(text, stop_words):
    words_to_keep = ["not", "no", "nor"]
    stopword = [elem for elem in stop_words if elem not in words_to_keep]
    text = [w.lower() for w in text if w.lower() not in stopword]
    return text
# Lemmatization
def lemmatize(text, wn):
    return [wn.lemmatize(word) for word in text]
# Stemming: Porter first, then Lancaster on the result
def stemming(text, ps, ls):
    text = [ps.stem(word) for word in text]
    text = [ls.stem(word) for word in text]
    return text
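# Illustrative sketch of the token-level steps on a toy sentence; assumes the
# nltk stopwords and wordnet corpora are already downloaded (nlp_processing
# below downloads them).
def _demo_token_pipeline():
    stop_words = stopwords.words('english')
    wn = nltk.WordNetLemmatizer()
    ps = nltk.PorterStemmer()
    ls = nltk.LancasterStemmer()
    tokens = tokenize("the shares are not falling")
    tokens = remove_stopwords(tokens, stop_words)  # drops "the"/"are", keeps "not"
    tokens = lemmatize(tokens, wn)
    tokens = stemming(tokens, ps, ls)
    print(tokens)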
def full_processing(df):
    stop_words = stopwords.words('english')
    wn = nltk.WordNetLemmatizer()
    ps = nltk.PorterStemmer()
    ls = nltk.LancasterStemmer()
    df["text_W_puncts"] = df["body_text"].apply(full_preprocessing)
    df["text_tokenized"] = df["text_W_puncts"].apply(tokenize)
    df["text_W_stopwords"] = df["text_tokenized"].apply(lambda x: remove_stopwords(x, stop_words))
    df["text_lemmatized"] = df["text_W_stopwords"].apply(lambda x: lemmatize(x, wn))
    df["text_stemmed"] = df["text_lemmatized"].apply(lambda x: stemming(x, ps, ls))
    df["text_processed"] = df["text_stemmed"].apply(lambda x: ' '.join(str(e) for e in x))
    return df
def nlp_processing(articles_df):
    news = articles_df[['body_text', 'publish_date', 'title']]
    # Diagnostic counts: articles containing mentions, hashtags, urls
    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    max_len, min_len, mean_len = -float("inf"), float("inf"), 0
    for row in news.values:
        text = row[0]  # 0 for text content
        if "@" in text:
            cnt_1 += 1
        if "#" in text:
            cnt_2 += 1
        if 'http' in text or 'www' in text:
            cnt_3 += 1
        if len(text) < min_len:
            min_len = len(text)
        if len(text) > max_len:
            max_len = len(text)
        mean_len += len(text)
    mean_len /= len(articles_df)
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    articles_processed = full_processing(articles_df)
    return articles_processed
## Vader Sentiment
# Map the compound score to a class: 0 = negative, 1 = neutral, 2 = positive
def predicted_label(x):
    if x <= -0.5:
        return 0
    elif x >= 0.5:
        return 2
    else:
        return 1
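# Quick sketch of the mapping on a few hypothetical compound scores.
def _demo_predicted_label():
    for score in (-0.8, 0.0, 0.7):
        print(score, '->', predicted_label(score))  # -> 0, 1, 2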
def score_Vader(df, analyzer):
    # Compute the polarity scores once per row, then split them into columns
    scores = df['text_processed'].apply(analyzer.polarity_scores)
    df['neg'] = scores.apply(lambda s: s['neg'])
    df['neu'] = scores.apply(lambda s: s['neu'])
    df['pos'] = scores.apply(lambda s: s['pos'])
    df['compound'] = scores.apply(lambda s: s['compound'])
    df['predicted_class'] = df['compound'].map(predicted_label)
    return df
def vader_sentiment(articles_processed):
    # Imported here so the module loads even without vaderSentiment installed
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()
    articles_processed = score_Vader(articles_processed, analyzer)
    return articles_processed
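# Minimal end-to-end sketch of the VADER step on a toy frame; the two
# body_text strings are hypothetical, and the nltk corpora plus the
# vaderSentiment package are assumed to be available.
def _demo_vader_sentiment():
    toy = pd.DataFrame({'body_text': ["great earnings beat expectations",
                                      "terrible losses and layoffs"]})
    toy = full_processing(toy)
    toy = vader_sentiment(toy)
    print(toy[['text_processed', 'compound', 'predicted_class']])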
def sentiment_analysis(company, day):
    articles_df = fetching_news(company)
    articles_df = select_oneday_news(articles_df, day)
    articles_df = articles_df.loc[articles_df['ticker'] == company.upper()]
    # articles_processed = nlp_processing(articles_df)
    # articles_sentimentalized = vader_sentiment(articles_processed)
    return articles_df
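# Hypothetical entry-point usage: select_oneday_news (from
# data_loader_functions) defines what `day` must look like; a date object and
# the company name 'Apple' are assumed here as example inputs.
def _demo_sentiment_analysis():
    import datetime
    print(sentiment_analysis('Apple', datetime.date(2023, 1, 2)))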
## Aggregate News Sentiments Each Day
def aggregate_by_date(articles_sentiments):
    articles_sentiments = change_date_format(articles_sentiments)
    keep_columns = ['ticker', 'publish_date', 'neg', 'neu', 'pos', 'compound']
    sentiment_df = articles_sentiments[keep_columns]
    # Average each sentiment score over all articles per day and ticker
    daily_sentiment = (
        sentiment_df
        .groupby([sentiment_df['publish_date'].dt.date, 'ticker'])
        .agg({'neg': 'mean', 'neu': 'mean', 'pos': 'mean', 'compound': 'mean'})
        .reset_index()
    )
    return daily_sentiment
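# Illustrative sketch of the daily aggregation on a toy frame (bypassing
# change_date_format, whose contract lives in data_loader_functions): two
# same-day AAPL rows collapse into one row of mean scores.
def _demo_aggregate_by_date():
    toy = pd.DataFrame({
        'ticker': ['AAPL', 'AAPL'],
        'publish_date': pd.to_datetime(['2023-01-02 09:00', '2023-01-02 17:00']),
        'neg': [0.1, 0.3], 'neu': [0.5, 0.5], 'pos': [0.4, 0.2],
        'compound': [0.6, -0.2],
    })
    daily = (toy.groupby([toy['publish_date'].dt.date, 'ticker'])
                .agg({'neg': 'mean', 'neu': 'mean', 'pos': 'mean', 'compound': 'mean'})
                .reset_index())
    print(daily)  # one row: neg 0.2, neu 0.5, pos 0.3, compound 0.2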