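"""Predict next-day closing prices with a pre-trained LSTM and social sentiment.

Loads scraped stock prices plus aggregated Reddit and Twitter sentiment,
rebuilds the scaled multi-feature input the saved model expects, predicts the
next-day close for a user-chosen stock, and reports evaluation metrics.
"""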
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# Load the scraped combined stock data
all_stock_data = pd.read_csv('scraped_combined_stock_data.csv')
all_stock_data['Date'] = pd.to_datetime(all_stock_data['Date'])

# Ensure 'Close' is numeric
all_stock_data['Close'] = pd.to_numeric(all_stock_data['Close'], errors='coerce')

# Drop rows with NaN values in 'Close'
all_stock_data = all_stock_data.dropna(subset=['Close'])

# Load Reddit data
reddit_data = pd.read_csv('cleaned_reddit_data.csv')
reddit_data['Date'] = pd.to_datetime(reddit_data['Date'])  # Ensure datetime format

# Aggregate sentiment scores for Reddit
reddit_sentiment = reddit_data.groupby('Date')['sentiment'].mean().reset_index()
reddit_sentiment.rename(columns={'sentiment': 'reddit_sentiment'}, inplace=True)

# Load Twitter data
twitter_data = pd.read_csv('cleaned_twitter_data.csv')
twitter_data['Date'] = pd.to_datetime(twitter_data['Date'])  # Ensure datetime format

# Aggregate sentiment scores for Twitter
twitter_sentiment = twitter_data.groupby('Date')['sentiment'].mean().reset_index()
twitter_sentiment.rename(columns={'sentiment': 'twitter_sentiment'}, inplace=True)

# Remove timezone information from 'Date' columns for merging
all_stock_data['Date'] = all_stock_data['Date'].dt.tz_localize(None)
reddit_sentiment['Date'] = reddit_sentiment['Date'].dt.tz_localize(None)
twitter_sentiment['Date'] = twitter_sentiment['Date'].dt.tz_localize(None)

# Merge Reddit and Twitter sentiment with stock data
all_stock_data = pd.merge(all_stock_data, reddit_sentiment, on='Date', how='left')
all_stock_data = pd.merge(all_stock_data, twitter_sentiment, on='Date', how='left')

# Fill missing sentiment scores with neutral sentiment (e.g., 0)
all_stock_data['reddit_sentiment'] = all_stock_data['reddit_sentiment'].fillna(0)
all_stock_data['twitter_sentiment'] = all_stock_data['twitter_sentiment'].fillna(0)

# Sequence creation for LSTM with multiple features
def create_sequences_multifeature(data, time_steps=60):
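    """Build rolling windows of shape (time_steps, n_features); the target is the next step's column 0."""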
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:i + time_steps])
        y.append(data[i + time_steps, 0])  # column 0 is 'Close', the prediction target
    return np.array(X), np.array(y)

# Preprocessing function for LSTM with sentiment data
def preprocess_for_lstm_multifeature(stock_data, stock_type, time_steps=60):
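    """Filter to one stock, min-max scale Close plus sentiment features, and window them for the LSTM."""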
    stock_data = stock_data[stock_data['Stock_Type'] == stock_type]
    features = stock_data[['Close', 'reddit_sentiment', 'twitter_sentiment']].values
    
    # Scale features
    scaler = MinMaxScaler(feature_range=(0, 1))
    features_scaled = scaler.fit_transform(features)
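    # Note: the scaler is fit on this stock's full history, so the most recent
    # window shares scaling statistics with earlier data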
    
    if len(features_scaled) > time_steps:
        # create_sequences_multifeature already returns X with shape
        # (samples, time_steps, n_features), so no further reshape is needed
        X, y = create_sequences_multifeature(features_scaled, time_steps)
        return X, y, scaler
    else:
        raise ValueError(f"{stock_type}: Insufficient data: {len(features_scaled)} rows available, at least {time_steps + 1} required.")

# Function to predict and evaluate
def predict_and_evaluate(stock_to_predict, scaler, time_steps=60):
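    """Predict the next-day close for one stock with its saved LSTM and report MSE, MAE, RMSE, and R²."""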
    stock_prices = all_stock_data[all_stock_data['Stock_Type'] == stock_to_predict][['Close', 'reddit_sentiment', 'twitter_sentiment']].values
    if len(stock_prices) >= time_steps:
        # Prepare data for prediction
        last_60_features = stock_prices[-time_steps:]
        
        # Scale the features and reshape to the model's (batch, time_steps, n_features) input
        last_60_scaled = scaler.transform(last_60_features).reshape(1, time_steps, last_60_features.shape[1])
        
        # Load the saved model
        model = load_model(f'lstm_{stock_to_predict}_model_with_sentiment.h5')
        
        # Predict next day price
        predicted_price_scaled = model.predict(last_60_scaled)
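        # The script assumes a single-output model, so predicted_price_scaled has shape (1, 1)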
        
        # Pad the prediction with zeros for the two sentiment columns so the
        # 3-feature scaler can inverse-transform it (only column 0 is meaningful)
        predicted_price_scaled_reshaped = np.concatenate((predicted_price_scaled, np.zeros((predicted_price_scaled.shape[0], 2))), axis=1)
        
        # Inverse transform to get the predicted 'Close' price
        predicted_price = scaler.inverse_transform(predicted_price_scaled_reshaped)
        
        # Use the last available close as a proxy ground truth for the metrics below
        real_data = all_stock_data[all_stock_data['Stock_Type'] == stock_to_predict].iloc[-1]['Close']
        
        # Evaluate different metrics
        mse = mean_squared_error([real_data], [predicted_price[0][0]])  # Wrap both in lists
        mae = mean_absolute_error([real_data], [predicted_price[0][0]])  # Wrap both in lists
        rmse = np.sqrt(mse)
        
        # R² is undefined for a single observation, so fall back to NaN
        def calculate_r2(real_data, predicted_data):
            if len(real_data) > 1:
                return r2_score(real_data, predicted_data)
            else:
                return np.nan  # cannot compute R² from a single point

        r2 = calculate_r2([real_data], [predicted_price[0][0]])

        # Plot the predicted vs. actual prices
        plt.figure(figsize=(12, 6))
        plt.plot([real_data], label='Real Price', color='blue', marker='o')
        plt.plot([predicted_price[0][0]], label='Predicted Price', color='red', marker='x')
        plt.title(f"{stock_to_predict} - Predicted vs Actual Price")
        plt.legend()
        plt.grid(True)
        plt.show()

        # Plot the performance metrics (the R² bar is skipped when R² is NaN)
        metrics = ['MSE', 'MAE', 'RMSE', 'R²']
        values = [mse, mae, rmse, r2]
        
        plt.figure(figsize=(8, 6))
        plt.bar(metrics, values, color=['blue', 'green', 'red', 'purple'])
        plt.title(f"{stock_to_predict} - Performance Metrics")
        plt.ylabel('Values')
        plt.show()

        print(f"Predicted Next Day Price for {stock_to_predict}: {predicted_price[0][0]}")
        print(f"Mean Squared Error (MSE) for {stock_to_predict}: {mse}")
        print(f"Mean Absolute Error (MAE) for {stock_to_predict}: {mae}")
        print(f"Root Mean Squared Error (RMSE) for {stock_to_predict}: {rmse}")
        print(f"R-squared (R²) for {stock_to_predict}: {r2}")
    else:
        print(f"Insufficient data for prediction: {len(stock_prices)} rows available, {time_steps} required.")

# Allow user to input which stock to predict
stock_to_predict = input("Enter the stock symbol to predict (e.g., 'AAPL'): ").strip()

try:
    # Get the scaler from preprocessing
    _, _, scaler = preprocess_for_lstm_multifeature(all_stock_data, stock_to_predict)
    
    # Perform prediction and evaluation
    predict_and_evaluate(stock_to_predict, scaler)
    
except (ValueError, OSError) as e:
    # OSError covers a missing or unreadable saved model file
    print(e)