import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# Load the scraped combined stock data
all_stock_data = pd.read_csv('scraped_combined_stock_data.csv')
all_stock_data['Date'] = pd.to_datetime(all_stock_data['Date'])

# Ensure 'Close' is numeric and drop rows where it is missing
all_stock_data['Close'] = pd.to_numeric(all_stock_data['Close'], errors='coerce')
all_stock_data = all_stock_data.dropna(subset=['Close'])

# Load Reddit data and aggregate daily sentiment scores
reddit_data = pd.read_csv('cleaned_reddit_data.csv')
reddit_data['Date'] = pd.to_datetime(reddit_data['Date'])  # Ensure datetime format
reddit_sentiment = reddit_data.groupby('Date')['sentiment'].mean().reset_index()
reddit_sentiment.rename(columns={'sentiment': 'reddit_sentiment'}, inplace=True)

# Load Twitter data and aggregate daily sentiment scores
twitter_data = pd.read_csv('cleaned_twitter_data.csv')
twitter_data['Date'] = pd.to_datetime(twitter_data['Date'])  # Ensure datetime format
twitter_sentiment = twitter_data.groupby('Date')['sentiment'].mean().reset_index()
twitter_sentiment.rename(columns={'sentiment': 'twitter_sentiment'}, inplace=True)

# Remove timezone information from 'Date' columns so the merges align
all_stock_data['Date'] = all_stock_data['Date'].dt.tz_localize(None)
reddit_sentiment['Date'] = reddit_sentiment['Date'].dt.tz_localize(None)
twitter_sentiment['Date'] = twitter_sentiment['Date'].dt.tz_localize(None)

# Merge Reddit and Twitter sentiment with the stock data
all_stock_data = pd.merge(all_stock_data, reddit_sentiment, on='Date', how='left')
all_stock_data = pd.merge(all_stock_data, twitter_sentiment, on='Date', how='left')

# Fill missing sentiment scores with a neutral value (0)
all_stock_data['reddit_sentiment'] = all_stock_data['reddit_sentiment'].fillna(0)
all_stock_data['twitter_sentiment'] = all_stock_data['twitter_sentiment'].fillna(0)


# Sequence creation for the LSTM with multiple features
def create_sequences_multifeature(data, time_steps=60):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:i + time_steps])
        y.append(data[i + time_steps, 0])  # Column 0 ('Close') is the target
    return np.array(X), np.array(y)


# Preprocessing for the LSTM with sentiment features
def preprocess_for_lstm_multifeature(stock_data, stock_type, time_steps=60):
    stock_data = stock_data[stock_data['Stock_Type'] == stock_type]
    features = stock_data[['Close', 'reddit_sentiment', 'twitter_sentiment']].values

    # Scale all features to the [0, 1] range
    scaler = MinMaxScaler(feature_range=(0, 1))
    features_scaled = scaler.fit_transform(features)

    if len(features_scaled) > time_steps:
        X, y = create_sequences_multifeature(features_scaled, time_steps)
        X = X.reshape(X.shape[0], X.shape[1], features_scaled.shape[1])
        return X, y, scaler
    else:
        raise ValueError(
            f"{stock_type}: Insufficient data: {len(features_scaled)} rows available, "
            f"{time_steps} required."
        )


# Predict the next-day price for a stock and report evaluation metrics
def predict_and_evaluate(stock_to_predict, scaler, time_steps=60):
    stock_prices = all_stock_data[all_stock_data['Stock_Type'] == stock_to_predict][
        ['Close', 'reddit_sentiment', 'twitter_sentiment']
    ].values

    if len(stock_prices) >= time_steps:
        # Use the last `time_steps` rows as the prediction window
        last_60_features = stock_prices[-time_steps:]

        # Scale the features and reshape to (samples, time_steps, features)
        last_60_scaled = scaler.transform(last_60_features).reshape(
            1, -1, last_60_features.shape[1]
        )

        # Load the saved model for this stock
        model = load_model(f'lstm_{stock_to_predict}_model_with_sentiment.h5')

        # Predict the next-day scaled 'Close' price
        predicted_price_scaled = model.predict(last_60_scaled)

        # Pad the prediction with zeros for the two sentiment columns so it matches
        # the scaler's feature count, then inverse-transform to get the 'Close' price
        predicted_price_scaled_reshaped = np.concatenate(
            (predicted_price_scaled, np.zeros((predicted_price_scaled.shape[0], 2))),
            axis=1
        )
        predicted_price = scaler.inverse_transform(predicted_price_scaled_reshaped)

        # Most recent actual 'Close' price, used as the reference value for the metrics
        real_data = all_stock_data[all_stock_data['Stock_Type'] == stock_to_predict].iloc[-1]['Close']

        # Evaluate the metrics (both values wrapped in lists for sklearn)
        mse = mean_squared_error([real_data], [predicted_price[0][0]])
        mae = mean_absolute_error([real_data], [predicted_price[0][0]])
        rmse = np.sqrt(mse)

        # R² is undefined for a single observation, so return NaN in that case
        def calculate_r2(real_values, predicted_values):
            if len(real_values) > 1:
                return r2_score(real_values, predicted_values)
            return np.nan

        r2 = calculate_r2([real_data], [predicted_price[0][0]])

        # Plot the predicted vs. actual prices
        plt.figure(figsize=(12, 6))
        plt.plot([real_data], label='Real Price', color='blue', marker='o')
        plt.plot([predicted_price[0][0]], label='Predicted Price', color='red', marker='x')
        plt.title(f"{stock_to_predict} - Predicted vs Actual Price")
        plt.legend()
        plt.grid(True)
        plt.show()

        # Plot the performance metrics
        metrics = ['MSE', 'MAE', 'RMSE', 'R²']
        values = [mse, mae, rmse, r2]
        plt.figure(figsize=(8, 6))
        plt.bar(metrics, values, color=['blue', 'green', 'red', 'purple'])
        plt.title(f"{stock_to_predict} - Performance Metrics")
        plt.ylabel('Values')
        plt.show()

        print(f"Predicted Next Day Price for {stock_to_predict}: {predicted_price[0][0]}")
        print(f"Mean Squared Error (MSE) for {stock_to_predict}: {mse}")
        print(f"Mean Absolute Error (MAE) for {stock_to_predict}: {mae}")
        print(f"Root Mean Squared Error (RMSE) for {stock_to_predict}: {rmse}")
        print(f"R-squared (R²) for {stock_to_predict}: {r2}")
    else:
        print(
            f"Insufficient data for prediction: {len(stock_prices)} rows available, "
            f"{time_steps} required."
        )


# Allow the user to choose which stock to predict
stock_to_predict = input("Enter the stock symbol to predict (e.g., 'AAPL'): ")

try:
    # Fit the scaler on this stock's history, then predict and evaluate
    _, _, scaler = preprocess_for_lstm_multifeature(all_stock_data, stock_to_predict)
    predict_and_evaluate(stock_to_predict, scaler)
except ValueError as e:
    print(e)
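

# ---------------------------------------------------------------------------
# Hedged sketch (assumption, not part of the script above): one way the saved
# models that predict_and_evaluate() loads ('lstm_<SYMBOL>_model_with_sentiment.h5')
# could have been produced. Only the feature set, sequence shape, and filename
# convention come from the code above; the layer sizes, dropout rate, optimizer,
# and epoch count below are illustrative guesses.
# ---------------------------------------------------------------------------
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


def train_and_save_lstm(stock_type, time_steps=60):
    # Reuse the same preprocessing as the prediction path so the saved model
    # expects input of shape (samples, time_steps, 3 features).
    X, y, _ = preprocess_for_lstm_multifeature(all_stock_data, stock_type, time_steps)

    model = Sequential([
        LSTM(50, return_sequences=True, input_shape=(X.shape[1], X.shape[2])),
        Dropout(0.2),
        LSTM(50),
        Dropout(0.2),
        Dense(1),  # predicts the scaled 'Close' price one step ahead
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X, y, epochs=20, batch_size=32, verbose=1)

    # Save under the filename convention expected by predict_and_evaluate().
    model.save(f'lstm_{stock_type}_model_with_sentiment.h5')
    return model


# Example usage (assumes 'AAPL' appears in the Stock_Type column):
# train_and_save_lstm('AAPL')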