Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import MinMaxScaler | |
| from tensorflow.keras.models import Sequential, load_model | |
| from tensorflow.keras.layers import LSTM, Dense, Dropout | |
| from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score | |
| import matplotlib.pyplot as plt | |
# ---------------------------------------------------------------------------
# Data preparation: one frame holding prices plus daily Reddit/Twitter
# sentiment, keyed on 'Date'.
# ---------------------------------------------------------------------------

# Stock prices: parse the dates, coerce 'Close' to numbers (entries that
# cannot be parsed become NaN) and keep only rows with a usable 'Close'.
all_stock_data = pd.read_csv('scraped_combined_stock_data.csv')
all_stock_data = all_stock_data.assign(
    Date=pd.to_datetime(all_stock_data['Date']),
    Close=pd.to_numeric(all_stock_data['Close'], errors='coerce'),
).dropna(subset=['Close'])

# Reddit: mean sentiment per distinct 'Date' value.
reddit_data = pd.read_csv('cleaned_reddit_data.csv')
reddit_data['Date'] = pd.to_datetime(reddit_data['Date'])
reddit_sentiment = (
    reddit_data.groupby('Date')['sentiment']
    .mean()
    .reset_index()
    .rename(columns={'sentiment': 'reddit_sentiment'})
)

# Twitter: mean sentiment per distinct 'Date' value.
twitter_data = pd.read_csv('cleaned_twitter_data.csv')
twitter_data['Date'] = pd.to_datetime(twitter_data['Date'])
twitter_sentiment = (
    twitter_data.groupby('Date')['sentiment']
    .mean()
    .reset_index()
    .rename(columns={'sentiment': 'twitter_sentiment'})
)

# The merge key must be timezone-naive in all three frames.
all_stock_data = all_stock_data.assign(
    Date=all_stock_data['Date'].dt.tz_localize(None)
)
reddit_sentiment = reddit_sentiment.assign(
    Date=reddit_sentiment['Date'].dt.tz_localize(None)
)
twitter_sentiment = twitter_sentiment.assign(
    Date=twitter_sentiment['Date'].dt.tz_localize(None)
)

# Left joins keep every stock row, including dates without any posts.
all_stock_data = all_stock_data.merge(
    reddit_sentiment, on='Date', how='left'
).merge(
    twitter_sentiment, on='Date', how='left'
)

# Dates without sentiment are treated as neutral (0).
all_stock_data = all_stock_data.fillna(
    {'reddit_sentiment': 0, 'twitter_sentiment': 0}
)
| # Sequence creation for LSTM with multiple features | |
def create_sequences_multifeature(data, time_steps=60):
    """Slice a feature matrix into overlapping windows for sequence models.

    Args:
        data: 2-D array-like of shape (n_rows, n_features). Column 0 is used
            as the prediction target ('Close' in this script).
        time_steps: Number of consecutive rows in each input window.

    Returns:
        A tuple ``(X, y)`` where ``X`` has shape
        (n_rows - time_steps, time_steps, n_features) and ``y[i]`` is
        column 0 of the row that immediately follows window ``X[i]``.
        When there are not more rows than ``time_steps`` both arrays are
        empty but keep those dimensions, so downstream shape handling
        still works.
    """
    # Accept plain nested lists as well as ndarrays; the target lookup
    # below relies on NumPy tuple indexing.
    data = np.asarray(data)
    n_sequences = len(data) - time_steps

    if n_sequences <= 0:
        # np.array([]) would collapse to shape (0,); keep the window and
        # feature axes so callers can still read X.shape[1:].
        empty_X = np.empty((0, time_steps) + data.shape[1:], dtype=data.dtype)
        return empty_X, np.empty((0,), dtype=data.dtype)

    X = np.array([data[start:start + time_steps] for start in range(n_sequences)])
    y = np.array([data[start + time_steps, 0] for start in range(n_sequences)])
    return X, y
| # Preprocessing function for LSTM with sentiment data | |
def preprocess_for_lstm_multifeature(stock_data, stock_type, time_steps=60):
    """Build scaled LSTM windows for a single stock.

    Args:
        stock_data: DataFrame with 'Stock_Type', 'Close', 'reddit_sentiment'
            and 'twitter_sentiment' columns.
        stock_type: Value of 'Stock_Type' selecting the rows to use.
        time_steps: Number of rows in each input window.

    Returns:
        A tuple ``(X, y, scaler)``: the windows and targets produced by
        ``create_sequences_multifeature`` on the scaled features, and the
        ``MinMaxScaler`` fitted on the three feature columns.

    Raises:
        ValueError: If the selected stock has no more than ``time_steps``
            rows, since at least one row beyond a full window is needed
            to form a target.
    """
    selected = stock_data[stock_data['Stock_Type'] == stock_type]
    features = selected[['Close', 'reddit_sentiment', 'twitter_sentiment']].values

    # Validate before fitting: for an unknown symbol the selection is empty
    # and fitting the scaler on it fails inside scikit-learn with a message
    # that says nothing about the stock or the required window length.
    if len(features) <= time_steps:
        raise ValueError(f"{stock_type}: Insufficient data: {len(features)} rows available, {time_steps} required.")

    # Scale every feature column to the [0, 1] range.
    scaler = MinMaxScaler(feature_range=(0, 1))
    features_scaled = scaler.fit_transform(features)

    # X already comes back as (samples, time_steps, n_features), so no
    # further reshape is needed.
    X, y = create_sequences_multifeature(features_scaled, time_steps)
    return X, y, scaler
| # Function to predict and evaluate | |
def predict_and_evaluate(stock_to_predict, scaler, time_steps=60):
    """Predict the next 'Close' for one stock, then plot and print metrics.

    Reads the module-level ``all_stock_data`` frame, loads the saved model
    ``lstm_{stock_to_predict}_model_with_sentiment.h5``, feeds it the last
    ``time_steps`` feature rows and converts the output back to price units.

    Args:
        stock_to_predict: Value matched against the 'Stock_Type' column and
            inserted into the model file name.
        scaler: Fitted scaler providing ``transform`` and
            ``inverse_transform`` over the columns
            ['Close', 'reddit_sentiment', 'twitter_sentiment'] (the main
            script passes the one returned by
            ``preprocess_for_lstm_multifeature``).
        time_steps: Number of most recent rows fed to the model.

    Returns:
        None. Results are shown as two matplotlib figures and printed. If
        fewer than ``time_steps`` rows exist, only a message is printed.

    NOTE(review): the "real" price used for the metrics is the last
    observed 'Close', which is also the final row of the model's input
    window. The metrics therefore compare the next-day forecast with the
    most recent known price, not with a held-out next-day value.
    """
    stock_rows = all_stock_data[all_stock_data['Stock_Type'] == stock_to_predict]
    stock_prices = stock_rows[['Close', 'reddit_sentiment', 'twitter_sentiment']].values

    if len(stock_prices) < time_steps:
        print(f"Insufficient data for prediction: {len(stock_prices)} rows available, {time_steps} required.")
        return

    # Most recent window, scaled and shaped as a single-sample batch.
    recent_features = stock_prices[-time_steps:]
    n_features = recent_features.shape[1]
    model_input = scaler.transform(recent_features).reshape(1, -1, n_features)

    # Load the saved model and predict the next (scaled) close.
    model = load_model(f'lstm_{stock_to_predict}_model_with_sentiment.h5')
    predicted_price_scaled = model.predict(model_input)

    # The scaler was fitted on all feature columns, so pad the prediction
    # with zeros for the non-target columns before inverting. The padding
    # width follows the feature count instead of being hard-coded.
    padding = np.zeros((predicted_price_scaled.shape[0], n_features - 1))
    predicted_price = scaler.inverse_transform(
        np.concatenate((predicted_price_scaled, padding), axis=1)
    )
    predicted_close = predicted_price[0][0]

    # Reference value for the metrics (see NOTE in the docstring).
    real_data = stock_rows.iloc[-1]['Close']

    mse = mean_squared_error([real_data], [predicted_close])
    mae = mean_absolute_error([real_data], [predicted_close])
    rmse = np.sqrt(mse)
    # R² cannot be calculated from a single sample, so it is reported as NaN.
    r2 = np.nan

    # Plot the predicted vs. actual prices
    plt.figure(figsize=(12, 6))
    plt.plot([real_data], label='Real Price', color='blue', marker='o')
    plt.plot([predicted_close], label='Predicted Price', color='red', marker='x')
    plt.title(f"{stock_to_predict} - Predicted vs Actual Price")
    plt.legend()
    plt.grid(True)
    plt.show()

    # Plot the performance metrics
    metrics = ['MSE', 'MAE', 'RMSE', 'R²']
    values = [mse, mae, rmse, r2]
    plt.figure(figsize=(8, 6))
    plt.bar(metrics, values, color=['blue', 'green', 'red', 'purple'])
    plt.title(f"{stock_to_predict} - Performance Metrics")
    plt.ylabel('Values')
    plt.show()

    print(f"Predicted Next Day Price for {stock_to_predict}: {predicted_close}")
    print(f"Mean Squared Error (MSE) for {stock_to_predict}: {mse}")
    print(f"Mean Absolute Error (MAE) for {stock_to_predict}: {mae}")
    print(f"Root Mean Squared Error (RMSE) for {stock_to_predict}: {rmse}")
    print(f"R-squared (R²) for {stock_to_predict}: {r2}")
# Allow user to input which stock to predict. Surrounding whitespace is
# stripped so a stray space does not stop the symbol from matching the
# 'Stock_Type' column.
stock_to_predict = input("Enter the stock symbol to predict (e.g., 'AAPL'): ").strip()
try:
    # Only the fitted scaler is needed here; the training windows are discarded.
    _, _, scaler = preprocess_for_lstm_multifeature(all_stock_data, stock_to_predict)
    # Perform prediction and evaluation
    predict_and_evaluate(stock_to_predict, scaler)
except ValueError as e:
    # Covers the insufficient-data error raised during preprocessing; other
    # exception types are left to propagate.
    print(e)