# STOCKBACK / datapreprocess.py
# roshcheeku's picture
# Upload 56 files
# 605fc75 verified
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Fetch the NLTK resources this script relies on (tokenizer models,
# stopword list and WordNet data), one download call per resource.
for _resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(_resource)

# Shared, module-level helpers used by get_sentiment() and clean_text()
sia = SentimentIntensityAnalyzer()
lemmatizer = WordNetLemmatizer()

# Read the input datasets from the working directory (adjust paths if needed)
twitter_aapl_tweets = pd.read_csv("aapl_tweets.csv")  # Twitter data
reddit_aapl_posts = pd.read_csv("reddit_aapl_posts.csv")  # Reddit data

# Show the column labels so the expected text columns can be checked by eye
print("Twitter columns:", twitter_aapl_tweets.columns)
print("Reddit columns:", reddit_aapl_posts.columns)
# Text cleaning helpers.
# The regular expressions are compiled once here rather than looked up on
# every call; the patterns themselves are unchanged.
_URL_RE = re.compile(r'http\S+|www\S+|https\S+')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_SPECIAL_CHAR_RE = re.compile(r'[^A-Za-z0-9\s]')

# Filled lazily by _english_stop_words() so the corpus is read only once.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stopword set, building it on first use only."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text(text):
    """Normalize a piece of raw post text into a space-joined token string.

    Steps, in order: strip URLs, @mentions, #hashtags and any character that
    is not an ASCII letter, digit or whitespace; lowercase; tokenize with
    ``word_tokenize``; drop English stopwords; lemmatize each remaining token.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as missing; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when the input
        is missing.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = _URL_RE.sub('', text)           # Remove URLs
    text = _MENTION_RE.sub('', text)       # Remove mentions
    text = _HASHTAG_RE.sub('', text)       # Remove hashtags
    text = _SPECIAL_CHAR_RE.sub('', text)  # Remove special characters

    tokens = word_tokenize(text.lower())
    stop_words = _english_stop_words()
    # Stopwords are filtered before lemmatizing, matching the original order.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )
# Sentiment scoring via the module-level VADER analyzer
def get_sentiment(text):
    """Return the 'compound' entry of the VADER polarity scores for *text*.

    The scores are produced by the module-level ``sia`` analyzer's
    ``polarity_scores`` method.
    """
    scores = sia.polarity_scores(text)
    return scores['compound']
# Clean and score both datasets, preview the results, then save them.
def _add_sentiment_columns(frame, text_column):
    """Add 'cleaned_text' and 'sentiment' columns to *frame* in place.

    Args:
        frame: DataFrame holding the raw posts.
        text_column: Name of the column in *frame* that contains the text.

    Returns:
        The same *frame*, with 'cleaned_text' (output of ``clean_text``) and
        'sentiment' (output of ``get_sentiment`` on the cleaned text) added.
    """
    # Empty CSV cells are loaded as NaN, which is not a string; map missing
    # values to '' and coerce everything else to str so one blank row does
    # not abort the whole run.
    frame['cleaned_text'] = frame[text_column].apply(
        lambda cell: clean_text('' if pd.isna(cell) else str(cell))
    )
    frame['sentiment'] = frame['cleaned_text'].apply(get_sentiment)
    return frame


# Twitter data: the tweet text lives in the 'Tweet' column
_add_sentiment_columns(twitter_aapl_tweets, 'Tweet')
print("\nCleaned Twitter Data:")
print(twitter_aapl_tweets[['Tweet', 'cleaned_text', 'sentiment']].head())

# Reddit data: assuming 'Title' contains the post text
# (adjust to 'Comments' if needed)
_add_sentiment_columns(reddit_aapl_posts, 'Title')
print("\nCleaned Reddit Data:")
print(reddit_aapl_posts[['Title', 'cleaned_text', 'sentiment']].head())

# Save cleaned data to CSV
twitter_aapl_tweets.to_csv("cleaned_twitter_data.csv", index=False)
reddit_aapl_posts.to_csv("cleaned_reddit_data.csv", index=False)
print("\nData cleaned and saved to CSV.")