# Linear Algebra and DataFrames
import numpy as np 
import pandas as pd 

# Visualization libraries
import seaborn as sns
sns.set_style("whitegrid")

# NLP Preprocessing and Basic tools
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
from emoji import demojize

from data_loader_functions import *


## Scrape news from investing.com
def news_scraping(company):
    # Scrape article URLs (pages 1 to 3) for the requested company
    if company == 'Amazon':
        urls = get_articles_urls('amazon-com', 1, 3)
    elif company == 'Meta':
        urls = get_articles_urls('facebook', 1, 3)
    else:
        urls = get_articles_urls('apple-computer', 1, 3)
    articles_df = pd.DataFrame({'ticker': [],
                                'publish_date': [],
                                'title': [],
                                'body_text': [],
                                'url': []})
    articles_df = scrape_news(urls, articles_df, company)

    # Drop articles whose body text appears more than once
    articles_df.drop_duplicates(subset='body_text', inplace=True)
    return articles_df
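
# Usage sketch (hypothetical call; get_articles_urls and scrape_news come from
# data_loader_functions): news_scraping('Apple') falls through to the
# 'apple-computer' slug and returns a de-duplicated DataFrame of articles.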


## Fetch news from hopsworks
def fetching_news(company):
    articles_df = get_news_from_hopsworks()
    # Note: ticker filtering is left to the caller (see sentiment_analysis),
    # which compares against company.upper()
    articles_df['publish_date'] = articles_df['publish_date'].apply(time_2_datetime)
    return articles_df


## NLP Processes
# Remove URLs
def remove_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# Remove the source prefix (text before the first '-') and @-mentions
def remove_usernames_ressource(text):
    text_split = text.split("-", 1)
    if len(text_split) > 1:
        text = text_split[1]
    text = re.sub(r'@[A-Za-z0-9_]+', ' ', text)
    return text

# Remove hashtags
def remove_hashtags(text):
    return re.sub("#[A-Za-z0-9_]+"," ", text)

# Remove punctuations
def remove_punctuation(text, punc_list):
    return text.translate(str.maketrans('', '', punc_list))

# Convert emojis to texts
def convert_emojis(text):
    return demojize(text).replace(":","")

# Apply the previous functions
def full_preprocessing(text):
    """
    @param    text (str): a string to be processed.
    @return   text (str): the processed string.
    """
    punc_list = string.punctuation

    text = remove_usernames_ressource(text)
    text = remove_urls(text)
    text = remove_hashtags(text)

    # Convert emojis to words *before* the non-ASCII filter below strips them
    text = convert_emojis(text)

    # Remove remaining non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Replace '&' with 'and'
    text = re.sub(r'&', 'and', text)

    text = remove_punctuation(text, punc_list)

    # Collapse runs of whitespace and strip leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()
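
# Illustrative example (hypothetical input; the ':rocket:' name comes from the
# emoji package's demojize):
#   full_preprocessing("AAPL & MSFT up 🚀 https://example.com #stocks")
#   -> "aapl and msft up rocket"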

# Transform text into tokens (whitespace-separated words)
def tokenize(text):
    return text.split()

# Remove stopwords, keeping negations that carry sentiment
def remove_stopwords(text, stop_words):
    words_to_keep = ["not", "no", "nor"]
    stopword = [elem for elem in stop_words if elem not in words_to_keep]
    text = [w.lower() for w in text if w.lower() not in stopword]
    return text

# Lemmatization
def lemmatize(text, wn):
    text = [wn.lemmatize(word) for word in text]
    return text

# Stemming
def stemming(text, ps, ls):
    text = [ps.stem(word) for word in text]
    text = [ls.stem(word) for word in text]
    return text

def full_processing(df):
    stop_words = stopwords.words('english')
    wn = nltk.WordNetLemmatizer()
    ps = nltk.PorterStemmer()
    ls = nltk.LancasterStemmer()

    # Run each preprocessing stage, keeping the intermediate columns
    df["text_W_puncts"] = df["body_text"].apply(full_preprocessing)
    df["text_tokenized"] = df["text_W_puncts"].apply(tokenize)
    df["text_W_stopwords"] = df["text_tokenized"].apply(lambda x: remove_stopwords(x, stop_words))
    df["text_lemmatized"] = df["text_W_stopwords"].apply(lambda x: lemmatize(x, wn))
    df["text_stemmed"] = df["text_lemmatized"].apply(lambda x: stemming(x, ps, ls))
    df["text_processed"] = df["text_stemmed"].apply(lambda x: ' '.join(x))

    return df
    

def nlp_processing(articles_df):
    news=articles_df[['body_text','publish_date','title']]
    # Count articles containing mentions, hashtags, and urls
    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    max_len, min_len, mean_len = -float("inf"), float("inf"), 0
    for row in news.values:
        text = row[0]  # 0 for text content
        if "@" in text:
            cnt_1 += 1
        if "#" in text:
            cnt_2 += 1
        if 'http' in text or 'www' in text:
            cnt_3 += 1
        if len(text) < min_len:
            min_len = len(text)
        if len(text) > max_len:
            max_len = len(text)

        mean_len += len(text)
        
    mean_len /= len(articles_df)
    
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')

    articles_processed = full_processing(articles_df)

    return articles_processed

    
## Vader Sentiment
# Map the compound score to a class label: 0 = negative, 1 = neutral, 2 = positive
def predicted_label(x):
    if x <= -0.5:
        return 0
    elif x >= 0.5:
        return 2
    else:
        return 1
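
# Note: the +/-0.5 cut-offs above are stricter than the +/-0.05 thresholds
# conventionally suggested for VADER's compound score, so more articles
# land in the neutral class.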

def score_Vader(df, analyzer):
    # Compute the polarity scores once per article, then expand into columns
    scores = df['text_processed'].apply(analyzer.polarity_scores)
    df['neg'] = scores.apply(lambda s: s['neg'])
    df['neu'] = scores.apply(lambda s: s['neu'])
    df['pos'] = scores.apply(lambda s: s['pos'])
    df['compound'] = scores.apply(lambda s: s['compound'])

    df['predicted_class'] = df['compound'].map(predicted_label)
    return df

def vader_sentiment(articles_processed):
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()
    articles_processed = score_Vader(articles_processed, analyzer)
    return articles_processed

def sentiment_analysis(company, day):
    articles_df = fetching_news(company)
    articles_df = select_oneday_news(articles_df, day)
    articles_df = articles_df.loc[articles_df['ticker'] == company.upper()]
    # articles_processed = nlp_processing(articles_df)
    # articles_sentimentalized = vader_sentiment(articles_processed)
    return articles_df

## Aggregate News Sentiments Each Day
def aggregate_by_date(articles_sentiments):
    articles_sentiments = change_date_format(articles_sentiments)
    keep_columns = ['ticker', 'publish_date', 'neg', 'neu', 'pos', 'compound']
    sentiment_df = articles_sentiments[keep_columns]
    # Average each sentiment score per calendar day and ticker
    daily_sentiment = (sentiment_df
                       .groupby([sentiment_df['publish_date'].dt.date, 'ticker'])
                       .agg({'neg': 'mean', 'neu': 'mean', 'pos': 'mean', 'compound': 'mean'})
                       .reset_index())
    return daily_sentiment
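

# Minimal end-to-end usage sketch (hypothetical ticker and date; assumes the
# Hopsworks feature store is reachable and that select_oneday_news, imported
# from data_loader_functions, accepts a 'YYYY-MM-DD' string):
if __name__ == "__main__":
    articles = sentiment_analysis('aapl', '2023-01-02')
    # To score and aggregate (these steps are currently commented out inside
    # sentiment_analysis):
    # articles = vader_sentiment(nlp_processing(articles))
    # daily = aggregate_by_date(articles)
    print(articles.head())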