Spaces:

roshcheeku
/

STOCKBACK

Sleeping

App Files Files Community

STOCKBACK / datapreprocess.py

roshcheeku

Upload 56 files

605fc75 verified 7 months ago

raw

history blame contribute delete

2.87 kB

	import pandas as pd
	import re
	import nltk
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	from nltk.stem import WordNetLemmatizer
	from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

	# Fetch the NLTK data packages this script relies on, in a fixed order
	for _nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
	    nltk.download(_nltk_resource)

	# Module-level NLP helpers shared by the functions below: the WordNet
	# lemmatizer and the VADER sentiment analyzer
	lemmatizer = WordNetLemmatizer()
	sia = SentimentIntensityAnalyzer()

	# Read the raw AAPL datasets; both paths are relative to the working
	# directory, so edit them if the CSV files live elsewhere
	twitter_aapl_tweets = pd.read_csv("aapl_tweets.csv")  # Twitter data
	reddit_aapl_posts = pd.read_csv("reddit_aapl_posts.csv")  # Reddit data

	# Echo each frame's column index so a wrong or renamed column is spotted
	# before the processing steps below look columns up by name
	for _caption, _loaded_frame in (
	    ("Twitter columns:", twitter_aapl_tweets),
	    ("Reddit columns:", reddit_aapl_posts),
	):
	    print(_caption, _loaded_frame.columns)

	# English stop-word set, built once here instead of once per cleaned row.
	# The NLTK 'stopwords' corpus must already be available when this line runs.
	_STOP_WORDS = frozenset(stopwords.words('english'))


	# Function to clean the text
	def clean_text(text):
	    """Normalize one social-media post into a space-joined token string.

	    Steps, in order: strip URLs, @mentions and #hashtags; drop every
	    character that is not an ASCII letter, digit or whitespace; lowercase;
	    tokenize with NLTK's ``word_tokenize``; remove English stop words;
	    lemmatize each remaining token with the module-level ``lemmatizer``.

	    Args:
	        text: Raw post text. Any non-string value (for example the float
	            NaN pandas uses for an empty CSV cell) is treated as empty text.

	    Returns:
	        The surviving lemmatized tokens joined by single spaces, or an
	        empty string when the input is not a string or nothing survives.
	    """
	    # re.sub raises TypeError on non-strings, so missing cells are mapped
	    # to empty text instead of aborting the whole DataFrame.apply run.
	    if not isinstance(text, str):
	        return ''

	    # Alternation must be a bare `|`: an escaped `\|` only matches a literal
	    # pipe character, which would leave every URL in the text.
	    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
	    text = re.sub(r'@\w+', '', text)  # Remove mentions
	    text = re.sub(r'#\w+', '', text)  # Remove hashtags (tag word included)
	    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters
	    tokens = word_tokenize(text.lower())

	    # Drop stop words first, then lemmatize what is left
	    lemmas = [
	        lemmatizer.lemmatize(token)
	        for token in tokens
	        if token not in _STOP_WORDS
	    ]
	    return ' '.join(lemmas)

	# Sentiment scoring helper backed by the module-level VADER analyzer `sia`
	def get_sentiment(text):
	    """Return the 'compound' entry of VADER's polarity scores for ``text``."""
	    polarity = sia.polarity_scores(text)
	    compound_score = polarity['compound']
	    return compound_score

	# Run the same two-step pipeline over each dataset: add a 'cleaned_text'
	# column produced by clean_text, then a 'sentiment' column produced by
	# get_sentiment on that cleaned text, and preview the first rows.
	# For Reddit, 'Title' is assumed to hold the post text (switch the column
	# name to 'Comments' if that is where the text lives).
	# NOTE(review): sentiment is scored on the cleaned text, i.e. after stop
	# words and punctuation were stripped; VADER presumably uses such cues, so
	# confirm that scoring the cleaned rather than the raw text is intended.
	for _heading, _dataset, _text_column in (
	    ("\nCleaned Twitter Data:", twitter_aapl_tweets, 'Tweet'),
	    ("\nCleaned Reddit Data:", reddit_aapl_posts, 'Title'),
	):
	    _dataset['cleaned_text'] = _dataset[_text_column].apply(clean_text)
	    _dataset['sentiment'] = _dataset['cleaned_text'].apply(get_sentiment)

	    print(_heading)
	    print(_dataset[[_text_column, 'cleaned_text', 'sentiment']].head())

	# Persist both frames only after both datasets were processed successfully
	for _dataset, _output_path in (
	    (twitter_aapl_tweets, "cleaned_twitter_data.csv"),
	    (reddit_aapl_posts, "cleaned_reddit_data.csv"),
	):
	    _dataset.to_csv(_output_path, index=False)

	print("\nData cleaned and saved to CSV.")