File size: 5,816 Bytes
605fc75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import ta
from datetime import datetime
import yfinance as yf
import matplotlib.pyplot as plt

# Shared VADER analyzer, created lazily by _get_vader_analyzer().
_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return the module-wide SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


# Function to calculate VADER sentiment scores
def calculate_vader_sentiment(text):
    """Calculate the VADER compound sentiment score for the given text.

    Args:
        text: Value to score; it is converted with ``str()`` before analysis.

    Returns:
        The ``'compound'`` entry of the analyzer's polarity scores, or ``0``
        if scoring fails for any reason (the error is printed).
    """
    try:
        # Reuse a single analyzer instead of constructing one per call: this
        # function is applied row by row, and each construction repeats the
        # analyzer's setup work.
        return _get_vader_analyzer().polarity_scores(str(text))['compound']
    except Exception as e:
        print(f"Error calculating sentiment: {e}")
        return 0

# Function to download stock data
def get_stock_data(ticker, start_date, end_date):
    """Download stock data using yfinance.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start_date: Start of the requested range (passed as ``start``).
        end_date: End of the requested range (passed as ``end``).

    Returns:
        The downloaded DataFrame with its index moved into regular columns,
        or ``None`` if the download raised or produced no rows.
    """
    try:
        stock = yf.download(ticker, start=start_date, end=end_date)

        # An empty frame carries no usable prices; report it as a failed
        # download so callers get the same None signal as for an exception.
        if stock is None or stock.empty:
            print(f"Error downloading stock data: no data returned for {ticker}")
            return None

        stock = stock.reset_index()
        print(f"\nDownloaded {ticker} stock data:")
        print(f"Date range: {stock['Date'].min()} to {stock['Date'].max()}")
        print(f"Number of records: {len(stock)}")
        return stock
    except Exception as e:
        print(f"Error downloading stock data: {e}")
        return None

def preprocess_stock_data(stock_symbol, sentiment_file, start_date='2023-01-01'):
    """
    Preprocess stock and sentiment data with proper error checking.

    Loads the sentiment CSV, downloads price history for ``stock_symbol`` from
    ``start_date`` up to today, adds 50- and 200-row simple moving averages of
    the close price, and joins each price row with the latest sentiment row
    dated at or before it.

    Args:
        stock_symbol: Ticker to download; also used to locate the
            ``Close_<symbol>`` column after column names are flattened.
        sentiment_file: Path of a CSV file that has a ``Date`` column. If it
            has a ``cleaned_text`` column, VADER scores are computed from it.
        start_date: First date of price history to request.

    Returns:
        A DataFrame with one row per price record, containing the price
        columns (close price renamed to ``close``), ``SMA_50``, ``SMA_200``
        and the columns of the matched sentiment row.

    Raises:
        ValueError: If the download fails, the close or ``Date`` column cannot
            be found, or the close column has no numeric values.
    """
    print(f"\nProcessing data for {stock_symbol}:")

    # Load sentiment data
    sentiment_data = pd.read_csv(sentiment_file)
    print(f"\nLoaded sentiment data from {sentiment_file}")
    print(f"Columns available: {sentiment_data.columns.tolist()}")

    # Score every row whenever the text column is available. This overwrites
    # any 'vader_avg_sentiment' column that was already in the file.
    if 'cleaned_text' in sentiment_data.columns:
        print("Calculating VADER sentiment scores...")
        sentiment_data['vader_avg_sentiment'] = sentiment_data['cleaned_text'].apply(calculate_vader_sentiment)

    # Convert sentiment data dates to datetime without timezone
    sentiment_data['Date'] = pd.to_datetime(sentiment_data['Date']).dt.tz_localize(None)

    # merge_asof refuses null merge keys, and a row without a date cannot be
    # aligned with any price row anyway, so discard such rows up front.
    missing_dates = sentiment_data['Date'].isna()
    if missing_dates.any():
        print(f"Dropping {int(missing_dates.sum())} sentiment rows with missing dates")
        sentiment_data = sentiment_data[~missing_dates]

    # Download stock data
    end_date = datetime.now().strftime('%Y-%m-%d')
    stock_data = get_stock_data(stock_symbol, start_date, end_date)

    if stock_data is None:
        raise ValueError(f"Failed to download stock data for {stock_symbol}")

    # Flatten column names if needed (tuple labels become 'Level0_Level1')
    if isinstance(stock_data.columns, pd.MultiIndex):
        stock_data.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in stock_data.columns]

    # Debugging: Check stock data structure
    print("\nStock data columns:", stock_data.columns.tolist())
    print("First few rows of stock data:")
    print(stock_data.head())

    # Flattening ('Date', '') yields 'Date_'; restore the plain name
    if 'Date_' in stock_data.columns:
        stock_data.rename(columns={'Date_': 'Date'}, inplace=True)

    # Check and rename the 'Close' column
    close_column = f"Close_{stock_symbol}" if f"Close_{stock_symbol}" in stock_data.columns else "Close"
    if close_column not in stock_data.columns:
        raise ValueError(f"'{close_column}' column not found in stock data. Available columns: {stock_data.columns.tolist()}")
    stock_data.rename(columns={close_column: 'close'}, inplace=True)

    # Ensure 'Date' column exists
    if 'Date' not in stock_data.columns:
        raise ValueError("'Date' column not found in stock data after renaming.")

    # Convert stock data dates to datetime without timezone
    stock_data['Date'] = pd.to_datetime(stock_data['Date']).dt.tz_localize(None)

    # Ensure 'close' is a numeric Series
    print("\nConverting 'close' column to numeric...")
    stock_data['close'] = pd.to_numeric(stock_data['close'], errors='coerce')
    if stock_data['close'].isnull().all():
        raise ValueError("'close' column contains all NaN values after conversion.")

    # Calculate SMA
    stock_data['SMA_50'] = ta.trend.sma_indicator(stock_data['close'], window=50)
    stock_data['SMA_200'] = ta.trend.sma_indicator(stock_data['close'], window=200)

    # Merge stock and sentiment data; both sides must be sorted on the key
    merged_data = pd.merge_asof(
        stock_data.sort_values('Date'),
        sentiment_data.sort_values('Date'),
        on='Date'
    )
    return merged_data

# Function to plot stock and sentiment data
def plot_stock_with_sentiment(data, stock_name):
    """Plot stock price and sentiment data.

    Draws two stacked panels and shows the figure: the close price with its
    50- and 200-day moving averages on top, and the VADER sentiment score
    below.

    Args:
        data: Table providing the ``Date``, ``close``, ``SMA_50``, ``SMA_200``
            and ``vader_avg_sentiment`` columns.
        stock_name: Name used in the title of the price panel.
    """
    # Pass the panel heights through gridspec_kw: the direct height_ratios
    # keyword of plt.subplots only exists in Matplotlib 3.6 and later, while
    # gridspec_kw gives the same 3:1 layout on older releases as well.
    _, (ax1, ax2) = plt.subplots(
        2, 1, figsize=(15, 10), gridspec_kw={'height_ratios': [3, 1]}
    )

    # Stock price plot
    ax1.plot(data['Date'], data['close'], label='Close Price')
    ax1.plot(data['Date'], data['SMA_50'], label='50-day SMA')
    ax1.plot(data['Date'], data['SMA_200'], label='200-day SMA')
    ax1.set_title(f'{stock_name} Stock Price with Technical Indicators')
    ax1.set_ylabel('Price')
    ax1.legend()

    # Sentiment plot
    ax2.plot(data['Date'], data['vader_avg_sentiment'], label='Sentiment', color='purple')
    ax2.fill_between(data['Date'], data['vader_avg_sentiment'], 0, alpha=0.2)
    ax2.set_title('Sentiment Score')
    ax2.set_ylabel('VADER Sentiment')
    # The sentiment line is labelled, so display the legend for it too
    ax2.legend()

    plt.tight_layout()
    plt.show()

# Main execution
if __name__ == "__main__":
    import traceback

    # Top-level boundary: report any failure with its traceback instead of
    # letting the script die on an unhandled exception.
    try:
        print("Starting data preprocessing...")

        # Twitter sentiment is paired with AAPL, Reddit sentiment with TSLA
        aapl_data = preprocess_stock_data('AAPL', 'cleaned_twitter_data.csv')
        tsla_data = preprocess_stock_data('TSLA', 'cleaned_reddit_data.csv')

        # Plot only after both datasets were prepared successfully
        print("\nGenerating plots...")
        for frame, symbol in ((aapl_data, 'AAPL'), (tsla_data, 'TSLA')):
            plot_stock_with_sentiment(frame, symbol)

    except Exception as exc:
        print(f"\nError during processing: {exc}")
        traceback.print_exc()