"""Clean and sentiment-score AAPL tweets and Reddit posts.

Loads raw CSV exports, normalizes the text (URLs/mentions/hashtags removed,
lowercased, tokenized, stopword-filtered, lemmatized), scores each document
with VADER's compound score, and writes the enriched frames back to CSV.
"""

import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Download necessary NLTK data (no-ops if already present locally).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Initialize SentimentIntensityAnalyzer and Lemmatizer.
sia = SentimentIntensityAnalyzer()
lemmatizer = WordNetLemmatizer()

# Built once at module level: the original rebuilt this set on every
# clean_text() call, which is loop-invariant work inside DataFrame.apply.
STOP_WORDS = set(stopwords.words('english'))


def clean_text(text):
    """Normalize *text* for sentiment scoring.

    Removes URLs, @mentions, #hashtags and special characters, lowercases,
    tokenizes, drops English stopwords, lemmatizes, and rejoins the tokens
    into a single space-separated string.

    Non-string input (e.g. float('nan') from a CSV cell that is empty) is
    treated as an empty document instead of raising TypeError from re.sub.
    """
    if not isinstance(text, str):  # pandas yields NaN (a float) for blanks
        return ''
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)                     # Remove mentions
    text = re.sub(r'#\w+', '', text)                     # Remove hashtags
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)           # Remove special characters
    tokens = word_tokenize(text.lower())                 # Lowercase + tokenize
    tokens = [word for word in tokens if word not in STOP_WORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


def get_sentiment(text):
    """Return VADER's compound sentiment score for *text*, in [-1.0, 1.0]."""
    return sia.polarity_scores(text)['compound']


# Load your datasets (adjust paths if needed).
twitter_aapl_tweets = pd.read_csv("aapl_tweets.csv")   # Example CSV file for Twitter data
reddit_aapl_posts = pd.read_csv("reddit_aapl_posts.csv")  # Example CSV file for Reddit data

# Print columns to verify correct data loading.
print("Twitter columns:", twitter_aapl_tweets.columns)
print("Reddit columns:", reddit_aapl_posts.columns)

# Apply cleaning and sentiment analysis to Twitter data.
twitter_aapl_tweets['cleaned_text'] = twitter_aapl_tweets['Tweet'].apply(clean_text)
twitter_aapl_tweets['sentiment'] = twitter_aapl_tweets['cleaned_text'].apply(get_sentiment)

# Print the cleaned Twitter data.
print("\nCleaned Twitter Data:")
print(twitter_aapl_tweets[['Tweet', 'cleaned_text', 'sentiment']].head())

# Apply cleaning and sentiment analysis to Reddit data.
# Assuming 'Title' contains the post text (adjust to 'Comments' if needed).
reddit_aapl_posts['cleaned_text'] = reddit_aapl_posts['Title'].apply(clean_text)  # Or 'Comments'
reddit_aapl_posts['sentiment'] = reddit_aapl_posts['cleaned_text'].apply(get_sentiment)

# Print the cleaned Reddit data.
print("\nCleaned Reddit Data:")
print(reddit_aapl_posts[['Title', 'cleaned_text', 'sentiment']].head())

# Save cleaned data to CSV.
twitter_aapl_tweets.to_csv("cleaned_twitter_data.csv", index=False)
reddit_aapl_posts.to_csv("cleaned_reddit_data.csv", index=False)
print("\nData cleaned and saved to CSV.")