"""Clean and sentiment-score AAPL tweets and Reddit posts.

Loads raw CSV exports, normalizes the text (URLs/mentions/hashtags removed,
lowercased, tokenized, stopword-filtered, lemmatized), scores each document
with VADER's compound score, and writes the enriched frames back to CSV.
"""

import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Download necessary NLTK data (no-ops if already present locally).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Initialize SentimentIntensityAnalyzer and Lemmatizer.
sia = SentimentIntensityAnalyzer()
lemmatizer = WordNetLemmatizer()

# Built once at module level: the original rebuilt this set on every
# clean_text() call, which is loop-invariant work inside DataFrame.apply.
STOP_WORDS = set(stopwords.words('english'))


def clean_text(text):
    """Normalize *text* for sentiment scoring.

    Removes URLs, @mentions, #hashtags and special characters, lowercases,
    tokenizes, drops English stopwords, lemmatizes, and rejoins the tokens
    into a single space-separated string.

    Non-string input (e.g. float('nan') from a CSV cell that is empty) is
    treated as an empty document instead of raising TypeError from re.sub.
    """
    if not isinstance(text, str):  # pandas yields NaN (a float) for blanks
        return ''
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)                     # Remove mentions
    text = re.sub(r'#\w+', '', text)                     # Remove hashtags
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)           # Remove special characters
    tokens = word_tokenize(text.lower())                 # Lowercase + tokenize
    tokens = [word for word in tokens if word not in STOP_WORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


def get_sentiment(text):
    """Return VADER's compound sentiment score for *text*, in [-1.0, 1.0]."""
    return sia.polarity_scores(text)['compound']


# Load your datasets (adjust paths if needed).
twitter_aapl_tweets = pd.read_csv("aapl_tweets.csv")   # Example CSV file for Twitter data
reddit_aapl_posts = pd.read_csv("reddit_aapl_posts.csv")  # Example CSV file for Reddit data

# Print columns to verify correct data loading.
print("Twitter columns:", twitter_aapl_tweets.columns)
print("Reddit columns:", reddit_aapl_posts.columns)

# Apply cleaning and sentiment analysis to Twitter data.
twitter_aapl_tweets['cleaned_text'] = twitter_aapl_tweets['Tweet'].apply(clean_text)
twitter_aapl_tweets['sentiment'] = twitter_aapl_tweets['cleaned_text'].apply(get_sentiment)

# Print the cleaned Twitter data.
print("\nCleaned Twitter Data:")
print(twitter_aapl_tweets[['Tweet', 'cleaned_text', 'sentiment']].head())

# Apply cleaning and sentiment analysis to Reddit data.
# Assuming 'Title' contains the post text (adjust to 'Comments' if needed).
reddit_aapl_posts['cleaned_text'] = reddit_aapl_posts['Title'].apply(clean_text)  # Or 'Comments'
reddit_aapl_posts['sentiment'] = reddit_aapl_posts['cleaned_text'].apply(get_sentiment)

# Print the cleaned Reddit data.
print("\nCleaned Reddit Data:")
print(reddit_aapl_posts[['Title', 'cleaned_text', 'sentiment']].head())

# Save cleaned data to CSV.
twitter_aapl_tweets.to_csv("cleaned_twitter_data.csv", index=False)
reddit_aapl_posts.to_csv("cleaned_reddit_data.csv", index=False)
print("\nData cleaned and saved to CSV.")