"""Combine downloaded stock prices with VADER sentiment scores and plot them."""

import traceback
from datetime import datetime
from functools import lru_cache

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ta
import yfinance as yf
from nltk.sentiment.vader import SentimentIntensityAnalyzer


@lru_cache(maxsize=1)
def _get_sentiment_analyzer():
    """Build the VADER analyzer once and reuse it for every scored text.

    ``lru_cache`` does not cache exceptions, so a failed construction is
    retried (and reported) on the next call.
    """
    return SentimentIntensityAnalyzer()


# Function to calculate VADER sentiment scores
def calculate_vader_sentiment(text):
    """Calculate the VADER compound sentiment score for the given text.

    Args:
        text: Value to score; it is converted with ``str()`` before scoring.

    Returns:
        The ``'compound'`` entry of the VADER polarity scores, or ``0`` if
        scoring raised an exception (the error is printed, not re-raised).
    """
    try:
        # Reuse a single analyzer instead of constructing one per row.
        return _get_sentiment_analyzer().polarity_scores(str(text))['compound']
    except Exception as e:
        # Deliberate best-effort: one bad row must not abort a whole column.
        print(f"Error calculating sentiment: {e}")
        return 0


# Function to download stock data
def get_stock_data(ticker, start_date, end_date):
    """Download stock data using yfinance.

    Args:
        ticker: Ticker symbol passed to ``yf.download``.
        start_date: Start of the requested range.
        end_date: End of the requested range.

    Returns:
        The downloaded DataFrame with its index reset to a column, or
        ``None`` if the download raised or returned no rows.
    """
    try:
        stock = yf.download(ticker, start=start_date, end=end_date)
        # A failed lookup can come back as an empty frame rather than an
        # exception; report it as a failure so callers' None check works.
        if stock is None or stock.empty:
            print(f"Error downloading stock data: no rows returned for {ticker}")
            return None

        # Read the range from the index before resetting it: with MultiIndex
        # columns, stock['Date'] would be a DataFrame, not a Series.
        first_date = stock.index.min()
        last_date = stock.index.max()
        stock = stock.reset_index()

        print(f"\nDownloaded {ticker} stock data:")
        print(f"Date range: {first_date} to {last_date}")
        print(f"Number of records: {len(stock)}")
        return stock
    except Exception as e:
        print(f"Error downloading stock data: {e}")
        return None


def _load_sentiment_data(sentiment_file):
    """Read the sentiment CSV, score its text and normalise its dates."""
    sentiment_data = pd.read_csv(sentiment_file)
    print(f"\nLoaded sentiment data from {sentiment_file}")
    print(f"Columns available: {sentiment_data.columns.tolist()}")

    # Scores are (re)computed whenever 'cleaned_text' exists, overwriting any
    # 'vader_avg_sentiment' column already in the file.
    if 'cleaned_text' in sentiment_data.columns:
        print("Calculating VADER sentiment scores...")
        sentiment_data['vader_avg_sentiment'] = (
            sentiment_data['cleaned_text'].apply(calculate_vader_sentiment)
        )

    # Convert sentiment data dates to datetime without timezone
    sentiment_data['Date'] = (
        pd.to_datetime(sentiment_data['Date']).dt.tz_localize(None)
    )

    # merge_asof rejects null merge keys, so rows without a date are dropped.
    missing_dates = int(sentiment_data['Date'].isna().sum())
    if missing_dates:
        print(f"Dropping {missing_dates} sentiment rows with missing dates")
        sentiment_data = sentiment_data.dropna(subset=['Date'])

    return sentiment_data


def _normalize_stock_columns(stock_data, stock_symbol):
    """Flatten column names and expose clean 'Date' and 'close' columns."""
    # Flatten column names if needed
    if isinstance(stock_data.columns, pd.MultiIndex):
        stock_data.columns = [
            '_'.join(col).strip() if isinstance(col, tuple) else col
            for col in stock_data.columns
        ]

    # Debugging: Check stock data structure
    print("\nStock data columns:", stock_data.columns.tolist())
    print("First few rows of stock data:")
    print(stock_data.head())

    # Flattening a ('Date', '') column pair yields 'Date_'
    if 'Date_' in stock_data.columns:
        stock_data.rename(columns={'Date_': 'Date'}, inplace=True)

    # Check and rename the 'Close' column
    symbol_close = f"Close_{stock_symbol}"
    close_column = symbol_close if symbol_close in stock_data.columns else "Close"
    if close_column not in stock_data.columns:
        raise ValueError(
            f"'{close_column}' column not found in stock data. "
            f"Available columns: {stock_data.columns.tolist()}"
        )
    stock_data.rename(columns={close_column: 'close'}, inplace=True)

    # Ensure 'Date' column exists
    if 'Date' not in stock_data.columns:
        raise ValueError("'Date' column not found in stock data after renaming.")

    # Convert stock data dates to datetime without timezone
    stock_data['Date'] = pd.to_datetime(stock_data['Date']).dt.tz_localize(None)

    # Ensure 'close' is a numeric Series
    print("\nConverting 'close' column to numeric...")
    stock_data['close'] = pd.to_numeric(stock_data['close'], errors='coerce')
    if stock_data['close'].isnull().all():
        raise ValueError("'close' column contains all NaN values after conversion.")

    return stock_data


def preprocess_stock_data(stock_symbol, sentiment_file, start_date='2023-01-01'):
    """Preprocess stock and sentiment data with proper error checking.

    Loads the sentiment CSV, downloads prices from ``start_date`` up to the
    current date, adds 50- and 200-period simple moving averages of the
    close price, and joins both tables on ``Date``.

    Args:
        stock_symbol: Ticker symbol to download.
        sentiment_file: Path of a CSV file that has a ``Date`` column and,
            optionally, a ``cleaned_text`` column to score with VADER.
        start_date: First date of the price download.

    Returns:
        The stock rows with ``close``, ``SMA_50`` and ``SMA_200`` columns,
        plus the columns of the matched sentiment row.

    Raises:
        ValueError: If the download fails or returns nothing, if the close
            or date column cannot be found, or if every close value is NaN.
    """
    print(f"\nProcessing data for {stock_symbol}:")

    # Load sentiment data
    sentiment_data = _load_sentiment_data(sentiment_file)

    # Download stock data
    end_date = datetime.now().strftime('%Y-%m-%d')
    stock_data = get_stock_data(stock_symbol, start_date, end_date)
    if stock_data is None:
        raise ValueError(f"Failed to download stock data for {stock_symbol}")

    stock_data = _normalize_stock_columns(stock_data, stock_symbol)

    # Calculate SMA
    stock_data['SMA_50'] = ta.trend.sma_indicator(stock_data['close'], window=50)
    stock_data['SMA_200'] = ta.trend.sma_indicator(stock_data['close'], window=200)

    # Merge stock and sentiment data. With its default direction, merge_asof
    # pairs each stock row with the latest sentiment row dated on or before it.
    # NOTE(review): when several sentiment rows share a date only one is used,
    # not their average, despite the 'vader_avg_sentiment' name - confirm.
    merged_data = pd.merge_asof(
        stock_data.sort_values('Date'),
        sentiment_data.sort_values('Date'),
        on='Date',
    )

    return merged_data


# Function to plot stock and sentiment data
def plot_stock_with_sentiment(data, stock_name):
    """Plot stock price and sentiment data.

    Draws two stacked axes: close price with both moving averages on top,
    and the sentiment score below, then shows the figure.

    Args:
        data: DataFrame with ``Date``, ``close``, ``SMA_50``, ``SMA_200``
            and ``vader_avg_sentiment`` columns.
        stock_name: Name used in the price plot title.
    """
    # gridspec_kw is used because passing height_ratios straight to
    # plt.subplots is only accepted by newer matplotlib releases.
    _, (ax1, ax2) = plt.subplots(
        2, 1, figsize=(15, 10), gridspec_kw={'height_ratios': [3, 1]}
    )

    # Stock price plot
    ax1.plot(data['Date'], data['close'], label='Close Price')
    ax1.plot(data['Date'], data['SMA_50'], label='50-day SMA')
    ax1.plot(data['Date'], data['SMA_200'], label='200-day SMA')
    ax1.set_title(f'{stock_name} Stock Price with Technical Indicators')
    ax1.set_ylabel('Price')
    ax1.legend()

    # Sentiment plot
    ax2.plot(data['Date'], data['vader_avg_sentiment'],
             label='Sentiment', color='purple')
    ax2.fill_between(data['Date'], data['vader_avg_sentiment'], 0, alpha=0.2)
    ax2.set_title('Sentiment Score')
    ax2.set_ylabel('VADER Sentiment')
    # The line is labelled, so show the legend the label was written for.
    ax2.legend()

    plt.tight_layout()
    plt.show()


def main():
    """Preprocess and plot AAPL and TSLA, printing any failure with its traceback."""
    try:
        print("Starting data preprocessing...")

        # Process AAPL data
        aapl_data = preprocess_stock_data(
            'AAPL',
            'cleaned_twitter_data.csv'
        )

        # Process TSLA data
        tsla_data = preprocess_stock_data(
            'TSLA',
            'cleaned_reddit_data.csv'
        )

        # Plot the processed data
        print("\nGenerating plots...")
        plot_stock_with_sentiment(aapl_data, 'AAPL')
        plot_stock_with_sentiment(tsla_data, 'TSLA')

    except Exception as e:
        # Top-level boundary of the script: report the failure and exit.
        print(f"\nError during processing: {e}")
        traceback.print_exc()


# Main execution
if __name__ == "__main__":
    main()