# LSTM next-day stock price prediction using scraped price history combined
# with daily Reddit and Twitter sentiment scores.
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
# ---------------------------------------------------------------------------
# Data loading and preparation
# ---------------------------------------------------------------------------
# Price history: parse dates, coerce Close to numeric, drop unparseable rows.
all_stock_data = pd.read_csv('scraped_combined_stock_data.csv')
all_stock_data['Date'] = pd.to_datetime(all_stock_data['Date'])
all_stock_data['Close'] = pd.to_numeric(all_stock_data['Close'], errors='coerce')
all_stock_data = all_stock_data.dropna(subset=['Close'])

# Reddit posts: average the sentiment score per calendar day.
reddit_data = pd.read_csv('cleaned_reddit_data.csv')
reddit_data['Date'] = pd.to_datetime(reddit_data['Date'])
reddit_sentiment = (
    reddit_data.groupby('Date', as_index=False)['sentiment']
    .mean()
    .rename(columns={'sentiment': 'reddit_sentiment'})
)

# Tweets: same daily aggregation.
twitter_data = pd.read_csv('cleaned_twitter_data.csv')
twitter_data['Date'] = pd.to_datetime(twitter_data['Date'])
twitter_sentiment = (
    twitter_data.groupby('Date', as_index=False)['sentiment']
    .mean()
    .rename(columns={'sentiment': 'twitter_sentiment'})
)

# Drop timezone info so all three 'Date' columns are merge-compatible.
for frame in (all_stock_data, reddit_sentiment, twitter_sentiment):
    frame['Date'] = frame['Date'].dt.tz_localize(None)

# Attach both sentiment series to the price table; days with no posts or
# tweets get a neutral score of 0.
all_stock_data = all_stock_data.merge(reddit_sentiment, on='Date', how='left')
all_stock_data = all_stock_data.merge(twitter_sentiment, on='Date', how='left')
all_stock_data['reddit_sentiment'] = all_stock_data['reddit_sentiment'].fillna(0)
all_stock_data['twitter_sentiment'] = all_stock_data['twitter_sentiment'].fillna(0)
# Sequence creation for LSTM with multiple features
def create_sequences_multifeature(data, time_steps=60):
    """Build (window, next-step-target) pairs for LSTM training.

    ``data`` is a 2-D array whose first column is the target series
    ('Close'); each target is that column one step past its window.

    Returns a tuple ``(X, y)`` where ``X`` has shape
    ``(len(data) - time_steps, time_steps, n_features)`` and ``y`` holds
    the first-column value immediately following each window.
    """
    n_windows = len(data) - time_steps
    windows = [data[start:start + time_steps] for start in range(n_windows)]
    targets = [data[start + time_steps, 0] for start in range(n_windows)]
    return np.array(windows), np.array(targets)
# Preprocessing function for LSTM with sentiment data
def preprocess_for_lstm_multifeature(stock_data, stock_type, time_steps=60):
    """Filter to one ticker, min-max scale Close plus both sentiment
    columns, and cut the result into LSTM training sequences.

    Returns ``(X, y, scaler)`` where ``scaler`` is the fitted
    MinMaxScaler.  Raises ValueError when the ticker has too few rows to
    form a single window.
    """
    subset = stock_data.loc[
        stock_data['Stock_Type'] == stock_type,
        ['Close', 'reddit_sentiment', 'twitter_sentiment'],
    ]
    raw = subset.values

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(raw)

    # Guard clause: need strictly more rows than the window length.
    if len(scaled) <= time_steps:
        raise ValueError(
            f"{stock_type}: Insufficient data: {len(scaled)} rows available, "
            f"{time_steps} required."
        )

    X, y = create_sequences_multifeature(scaled, time_steps)
    X = X.reshape(X.shape[0], X.shape[1], scaled.shape[1])
    return X, y, scaler
# Function to predict and evaluate
def predict_and_evaluate(stock_to_predict, scaler, time_steps=60):
stock_prices = all_stock_data[all_stock_data['Stock_Type'] == stock_to_predict][['Close', 'reddit_sentiment', 'twitter_sentiment']].values
if len(stock_prices) >= time_steps:
# Prepare data for prediction
last_60_features = stock_prices[-time_steps:]
# Scale the features
last_60_scaled = scaler.transform(last_60_features).reshape(1, -1, last_60_features.shape[1])
# Load the saved model
model = load_model(f'lstm_{stock_to_predict}_model_with_sentiment.h5')
# Predict next day price
predicted_price_scaled = model.predict(last_60_scaled)
# Reshape the prediction to match the number of features
predicted_price_scaled_reshaped = np.concatenate((predicted_price_scaled, np.zeros((predicted_price_scaled.shape[0], 2))), axis=1)
# Inverse transform to get the predicted 'Close' price
predicted_price = scaler.inverse_transform(predicted_price_scaled_reshaped)
# Real data for MSE and other metrics
real_data = all_stock_data[all_stock_data['Stock_Type'] == stock_to_predict].iloc[-1]['Close']
# Evaluate different metrics
mse = mean_squared_error([real_data], [predicted_price[0][0]]) # Wrap both in lists
mae = mean_absolute_error([real_data], [predicted_price[0][0]]) # Wrap both in lists
rmse = np.sqrt(mse)
# Adjusted R-squared for single prediction
def calculate_r2(real_data, predicted_data):
if len(real_data) > 1:
return r2_score(real_data, predicted_data)
else:
return np.nan # Return NaN if R² cannot be calculated
r2 = calculate_r2([real_data], [predicted_price[0][0]])
# Plot the predicted vs. actual prices
plt.figure(figsize=(12, 6))
plt.plot([real_data], label='Real Price', color='blue', marker='o')
plt.plot([predicted_price[0][0]], label='Predicted Price', color='red', marker='x')
plt.title(f"{stock_to_predict} - Predicted vs Actual Price")
plt.legend()
plt.grid(True)
plt.show()
# Plot the performance metrics
metrics = ['MSE', 'MAE', 'RMSE', 'R²']
values = [mse, mae, rmse, r2]
plt.figure(figsize=(8, 6))
plt.bar(metrics, values, color=['blue', 'green', 'red', 'purple'])
plt.title(f"{stock_to_predict} - Performance Metrics")
plt.ylabel('Values')
plt.show()
print(f"Predicted Next Day Price for {stock_to_predict}: {predicted_price[0][0]}")
print(f"Mean Squared Error (MSE) for {stock_to_predict}: {mse}")
print(f"Mean Absolute Error (MAE) for {stock_to_predict}: {mae}")
print(f"Root Mean Squared Error (RMSE) for {stock_to_predict}: {rmse}")
print(f"R-squared (R²) for {stock_to_predict}: {r2}")
else:
print(f"Insufficient data for prediction: {len(stock_prices)} rows available, {time_steps} required.")
# ---------------------------------------------------------------------------
# Interactive driver: ask which ticker to forecast, then run the pipeline.
# ---------------------------------------------------------------------------
stock_to_predict = input("Enter the stock symbol to predict (e.g., 'AAPL'): ")
try:
    # Fit the scaler on this ticker's history (X/y are not needed here),
    # then forecast and evaluate with the saved model.
    _unused_X, _unused_y, fitted_scaler = preprocess_for_lstm_multifeature(
        all_stock_data, stock_to_predict
    )
    predict_and_evaluate(stock_to_predict, fitted_scaler)
except ValueError as err:
    # preprocess_for_lstm_multifeature raises ValueError when the ticker
    # has too little history for one window.
    print(err)