# Linear Algebra and DataFrames
import numpy as np
import pandas as pd
# Visualization libraries
import seaborn as sns
sns.set_style("whitegrid")
# NLP Preprocessing and Basic tools
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
from emoji import demojize
from data_loader_functions import *
## Scrape news from investing.com
def news_scraping(company):
    # Map the company name to its investing.com slug and collect article URLs
    # from result pages 1 to 3
    if company == 'Amazon':
        urls = get_articles_urls('amazon-com', 1, 3)
    elif company == 'Meta':
        urls = get_articles_urls('facebook', 1, 3)
    else:
        urls = get_articles_urls('apple-computer', 1, 3)
    articles_df = pd.DataFrame({'ticker': [],
                                'publish_date': [],
                                'title': [],
                                'body_text': [],
                                'url': []})
    articles_df = scrape_news(urls, articles_df, company)
    # Inspect articles whose body text was scraped more than once
    articles_df[articles_df.duplicated('body_text', keep=False)].sort_values('body_text')
    # Drop all duplicates
    articles_df.drop_duplicates(subset='body_text', inplace=True)
    return articles_df
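# A usage sketch (hypothetical call; the page range 1-3 is hard-coded above):
#   df = news_scraping('Meta')
#   df.columns  # -> ticker, publish_date, title, body_text, url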
## Fetch news from hopsworks
def fetching_news(company):
    articles_df = get_news_from_hopsworks()
    # Ticker filtering is applied by the caller (see sentiment_analysis)
    articles_df['publish_date'] = articles_df['publish_date'].apply(time_2_datetime)
    return articles_df
## NLP Processes
# Remove URLs
def remove_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)
# Remove the news-source prefix (e.g. "Investing.com - ...") and @mentions
def remove_usernames_ressource(text):
    text_split = text.split("-", 1)
    if len(text_split) > 1:
        text = text_split[1]
    text = re.sub(r'@[A-Za-z0-9_]+', ' ', text)
    return text
# Remove hashtags
def remove_hashtags(text):
    return re.sub("#[A-Za-z0-9_]+", " ", text)
# Remove punctuation
def remove_punctuation(text, punc_list):
    return text.translate(str.maketrans('', '', punc_list))
# Convert emojis to text (demojize yields ":name:", then the colons are stripped)
def convert_emojis(text):
    return demojize(text).replace(":", "")
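# A quick illustration (":rocket:" is the demojize name for the rocket emoji):
#   convert_emojis("to the moon 🚀")  # -> "to the moon rocket"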
# Apply the previous functions
def full_preprocessing(text):
    """
    @param text (str): a string to be processed.
    @return text (str): the processed string.
    """
    punc_list = string.punctuation
    # Replace '&' with 'and'
    text = re.sub(r'&', 'and', text)
    # Collapse repeated whitespace and strip leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()
    text = remove_urls(text)
    text = remove_usernames_ressource(text)
    text = remove_hashtags(text)
    # Convert emojis before stripping non-ASCII characters, otherwise they
    # would be deleted before they can be converted
    text = convert_emojis(text)
    # Remove remaining non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = remove_punctuation(text, punc_list)
    return text.lower()
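# A sanity-check example (hypothetical headline; the exact spacing of the
# output may differ slightly since whitespace is collapsed early on):
#   full_preprocessing("Investing.com - Apple & Co. hit a record https://t.co/xyz #AAPL")
#   # -> roughly "apple and co hit a record"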
# Transform text into tokens (separate words)
def tokenize(text):
    return text.split()
# Remove stopwords, but keep negations since they carry sentiment
def remove_stopwords(text, stop_words):
    words_to_keep = ["not", "no", "nor"]
    stopword = [elem for elem in stop_words if elem not in words_to_keep]
    text = [w.lower() for w in text if w.lower() not in stopword]
    return text
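# For instance, negations survive while ordinary stopwords are dropped:
#   remove_stopwords(["Apple", "is", "not", "falling"], stopwords.words('english'))
#   # -> ["apple", "not", "falling"]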
# Lemmatization
def lemmatize(text, wn):
    return [wn.lemmatize(word) for word in text]
# Stemming (Porter first, then the more aggressive Lancaster stemmer)
def stemming(text, ps, ls):
    text = [ps.stem(word) for word in text]
    text = [ls.stem(word) for word in text]
    return text
def full_processing(df):
    stop_words = stopwords.words('english')
    wn = nltk.WordNetLemmatizer()
    ps = nltk.PorterStemmer()
    ls = nltk.LancasterStemmer()
    # Each step stores its output in a new column, so the intermediate
    # representations stay inspectable
    df["text_W_puncts"] = df["body_text"].apply(full_preprocessing)
    df["text_tokenized"] = df["text_W_puncts"].apply(tokenize)
    df["text_W_stopwords"] = df["text_tokenized"].apply(lambda x: remove_stopwords(x, stop_words))
    df["text_lemmatized"] = df["text_W_stopwords"].apply(lambda x: lemmatize(x, wn))
    df["text_stemmed"] = df["text_lemmatized"].apply(lambda x: stemming(x, ps, ls))
    df["text_processed"] = df["text_stemmed"].apply(lambda x: ' '.join(str(e) for e in x))
    return df
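# Usage sketch (assumes the NLTK resources downloaded in nlp_processing below
# and a DataFrame with a 'body_text' column):
#   df = pd.DataFrame({'body_text': ["Apple shares are not falling"]})
#   full_processing(df)['text_processed']  # short, stemmed, sentiment-ready text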
def nlp_processing(articles_df):
    news = articles_df[['body_text', 'publish_date', 'title']]
    # Exploratory statistics: number of articles containing mentions,
    # hashtags and URLs, plus text-length extremes and mean
    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    max_len, min_len, mean_len = -float("inf"), float("inf"), 0
    for row in news.values:
        text = row[0]  # 0 for the text content
        if "@" in text:
            cnt_1 += 1
        if "#" in text:
            cnt_2 += 1
        if 'http' in text or 'www' in text:
            cnt_3 += 1
        if len(text) < min_len:
            min_len = len(text)
        if len(text) > max_len:
            max_len = len(text)
        mean_len += len(text)
    mean_len /= len(articles_df)
    # NLTK resources needed by the stopword-removal and lemmatization steps
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    articles_processed = full_processing(articles_df)
    return articles_processed
## Vader Sentiment
# Map the compound score to a class: 0 = negative, 1 = neutral, 2 = positive
def predicted_label(x):
    if x <= -0.5:
        return 0
    elif x >= 0.5:
        return 2
    else:
        return 1
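# The thresholds give three classes, e.g.:
#   predicted_label(-0.7)  # -> 0 (negative)
#   predicted_label(0.1)   # -> 1 (neutral)
#   predicted_label(0.8)   # -> 2 (positive)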
def score_Vader(df, analyzer):
    # Compute the polarity scores once per article and split them into columns
    scores = df['text_processed'].apply(analyzer.polarity_scores)
    df['neg'] = scores.apply(lambda s: s['neg'])
    df['neu'] = scores.apply(lambda s: s['neu'])
    df['pos'] = scores.apply(lambda s: s['pos'])
    df['compound'] = scores.apply(lambda s: s['compound'])
    df['predicted_class'] = df['compound'].map(predicted_label)
    return df
def vader_sentiment(articles_processed):
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()
    articles_processed = score_Vader(articles_processed, analyzer)
    return articles_processed
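# Standalone scoring sketch (assumes a 'text_processed' column already exists):
#   scored = vader_sentiment(pd.DataFrame({'text_processed': ["great quarter"]}))
#   scored[['neg', 'neu', 'pos', 'compound', 'predicted_class']]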
def sentiment_analysis(company, day):
    articles_df = fetching_news(company)
    articles_df = select_oneday_news(articles_df, day)
    articles_df = articles_df.loc[articles_df['ticker'] == company.upper()]
    # The processing and scoring steps are currently disabled:
    # articles_processed = nlp_processing(articles_df)
    # articles_sentimentalized = vader_sentiment(articles_processed)
    return articles_df
## Aggregate News Sentiments Each Day
def aggregate_by_date(articles_sentiments):
    articles_sentiments = change_date_format(articles_sentiments)
    keep_columns = ['ticker', 'publish_date', 'neg', 'neu', 'pos', 'compound']
    sentiment_df = articles_sentiments[keep_columns]
    # Average each sentiment score per calendar day and ticker
    daily_sentiment = (sentiment_df
                       .groupby([sentiment_df['publish_date'].dt.date, 'ticker'])
                       .agg({'neg': 'mean', 'neu': 'mean', 'pos': 'mean', 'compound': 'mean'})
                       .reset_index())
    return daily_sentiment
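# End-to-end sketch (assumes scored articles, e.g. from vader_sentiment, and
# the change_date_format helper from data_loader_functions):
#   daily = aggregate_by_date(articles_sentimentalized)
#   daily.head()  # one row per (day, ticker) with mean neg/neu/pos/compound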