import os

import joblib
import matplotlib.pyplot as plt
import mlflow
import mlflow.pytorch
import mlflow.sklearn
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, Dataset

mlflow.set_tracking_uri("./mlruns")
mlflow.set_experiment("Investor-Sentiment-Aware-Models")


def load_stock_data():
    """Load a yfinance-style multi-ticker CSV (metric names in row 0, ticker
    names in row 1) and reshape it to one row per (Date, Ticker)."""
    path = os.path.join('data', 'stock_prices.csv')
    df_raw = pd.read_csv(path, skiprows=1, header=0, na_values=[''])
    df_raw = df_raw.rename(columns={df_raw.columns[0]: 'Date'})
    # Row 0 of the file holds the metric name for each price column.
    first_row = pd.read_csv(path, nrows=1, header=None).iloc[0]
    metric_names = first_row.iloc[1:].tolist()
    rename_dict = {col: metric for col, metric in zip(df_raw.columns[1:], metric_names)}
    df_raw = df_raw.rename(columns=rename_dict)
    df_long = pd.melt(df_raw, id_vars=['Date'], value_vars=df_raw.columns[1:],
                      var_name='Metric', value_name='Value')
    # melt stacks the value columns block-wise (all rows of column 1, then all
    # of column 2, ...). Assuming the usual yfinance layout of 5 metrics, each
    # cycling through the 3 tickers, the label array must follow that block
    # order: one ticker per column, repeated for that column's len(df_raw) rows.
    tickers = ['AAPL', 'GOOGL', 'TSLA']
    df_long['Ticker'] = np.repeat(np.tile(tickers, 5), len(df_raw))
    df_prices = df_long.pivot_table(index=['Date', 'Ticker'], columns='Metric',
                                    values='Value', aggfunc='first').reset_index()
    df_prices['Date'] = pd.to_datetime(df_prices['Date'], errors='coerce')
    numeric_cols = ['Close', 'High', 'Low', 'Open', 'Volume']
    df_prices[numeric_cols] = df_prices[numeric_cols].astype(float)
    # Sort chronologically before differencing: the string dates used by the
    # pivot are not guaranteed to be in calendar order.
    df_prices = df_prices.sort_values(['Ticker', 'Date']).reset_index(drop=True)
    df_prices['Return'] = df_prices.groupby('Ticker')['Close'].pct_change()
    return df_prices


def load_text_data():
    """Combine Reddit, news, and GNews text into one cleaned frame."""
    df_reddit = pd.read_csv(os.path.join('data', 'reddit_data.csv'))
    df_news = pd.read_csv(os.path.join('data', 'news_articles.csv'))
    df_gnews = pd.read_csv(os.path.join('data', 'gnews_data.csv'))
    frames = []
    for df, src in [(df_reddit, 'reddit'), (df_news, 'news'), (df_gnews, 'gnews')]:
        df = df.rename(columns={'content': 'text'})
        df['source'] = src
        # Rebinding df inside the loop does not propagate back to the source
        # frames, so collect the trimmed copies explicitly and concat those.
        frames.append(df[['text', 'publishedAt', 'source']])
    df_text = pd.concat(frames, ignore_index=True)
    df_text['text'] = df_text['text'].astype(str).str.lower()
    df_text['text'] = df_text['text'].str.replace(r'http\S+|www\S+', '', regex=True)
    df_text['text'] = (df_text['text']
                       .str.replace(r'[^a-zA-Z\s]', ' ', regex=True)
                       .str.replace(r'\s+', ' ', regex=True)
                       .str.strip())
    df_text['date'] = pd.to_datetime(df_text['publishedAt'], errors='coerce').dt.date
    df_text = df_text.dropna(subset=['date'])
    return df_text


df_prices = load_stock_data()
df_text = load_text_data()

# Small keyword lexicon for a lightweight, dependency-free sentiment score.
positive_words = {'good', 'buy', 'up', 'rise', 'gain', 'positive', 'bull',
                  'strong', 'profit', 'growth', 'high', 'best', 'win',
                  'success', 'pump', 'moon', 'rocket'}
negative_words = {'bad', 'sell', 'down', 'fall', 'loss', 'negative', 'bear',
                  'weak', 'decline', 'low', 'worst', 'fail', 'crash', 'risk',
                  'dump', 'scam'}


def simple_sentiment(text):
    """Score text in [-1, 1] as (positive hits - negative hits) / total hits."""
    words = text.split()
    pos_count = sum(1 for word in words if word in positive_words)
    neg_count = sum(1 for word in words if word in negative_words)
    total = pos_count + neg_count
    if total == 0:
        return 0.0
    return (pos_count - neg_count) / total


df_text['sentiment'] = df_text['text'].apply(simple_sentiment)
daily_sent = df_text.groupby(['date', 'source'])['sentiment'].mean().reset_index()
daily_sent_total = daily_sent.groupby('date')['sentiment'].mean().reset_index()

df_prices['date'] = df_prices['Date'].dt.date
daily_sent_total['date'] = pd.to_datetime(daily_sent_total['date']).dt.date
df_merged = df_prices.merge(daily_sent_total, on='date', how='left')
df_merged = df_merged.sort_values(['Ticker', 'Date']).reset_index(drop=True)
# Forward-fill per ticker so a gap day inherits the last observed market-wide
# sentiment; days before any text data exists default to neutral (0).
df_merged['sentiment'] = df_merged.groupby('Ticker')['sentiment'].ffill().fillna(0)
# Lag by one day so the feature only uses information available before the
# target day. Backfilling here would leak future sentiment into the first row
# of each ticker, so missing lags default to neutral instead.
df_merged['sentiment_lag1'] = df_merged.groupby('Ticker')['sentiment'].shift(1).fillna(0)
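# Optional sanity check (illustrative, not part of the pipeline): confirm the
# lagged sentiment feature is fully populated and eyeball how much of the
# price history actually has non-neutral sentiment attached.
assert df_merged['sentiment_lag1'].notna().all()
print(df_merged[['Date', 'Ticker', 'Return', 'sentiment', 'sentiment_lag1']].head())
print(f"Share of rows with non-neutral sentiment: {(df_merged['sentiment'] != 0).mean():.1%}")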
# --- Single-ticker setup (GOOGL), which also fixes the shared input_size ---
ticker = 'GOOGL'
df_ticker = df_merged[df_merged['Ticker'] == ticker].copy()
df_ticker = df_ticker.sort_values('Date')
df_ticker['return_lag1'] = df_ticker['Return'].shift(1)
df_ticker['volume_lag1'] = df_ticker['Volume'].shift(1)
df_ticker.dropna(inplace=True)
# Target: the next day's return.
df_ticker['target_return'] = df_ticker['Return'].shift(-1)
df_ticker.dropna(inplace=True)

features = ['return_lag1', 'volume_lag1', 'sentiment_lag1']
X = df_ticker[features].values
y = df_ticker['target_return'].values

scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Chronological 80/20 split (no shuffling for time series).
train_size = int(len(X) * 0.8)
X_train, X_test = X_scaled[:train_size], X_scaled[train_size:]
y_train, y_test = y_scaled[:train_size], y_scaled[train_size:]


class TimeSeriesDataset(Dataset):
    """Wraps flat (X, y) arrays for batched PyTorch training."""

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


def train_model(model, loader, epochs=50):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(epochs):
        model.train()
        for batch_x, batch_y in loader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y.unsqueeze(1))
            loss.backward()
            optimizer.step()


def predict_model(model, loader):
    model.eval()
    preds = []
    with torch.no_grad():
        for batch_x, _ in loader:
            outputs = model(batch_x)
            # squeeze(-1) rather than squeeze(): a final batch of size 1 would
            # otherwise collapse to a 0-d array and break extend().
            preds.extend(outputs.squeeze(-1).numpy())
    return np.array(preds)


input_size = X.shape[1]


class MLPModel(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, 25)
        self.fc3 = nn.Linear(25, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)


mlp_model = MLPModel(input_size)


def create_sequences(data_X, data_y, seq_length):
    """Slice a flat series into (window, next-step target) pairs."""
    xs, ys = [], []
    for i in range(len(data_X) - seq_length):
        xs.append(data_X[i:i + seq_length])
        ys.append(data_y[i + seq_length])
    return np.array(xs), np.array(ys)


seq_length = 10
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)
train_size_seq = int(len(X_seq) * 0.8)
X_train_seq, X_test_seq = X_seq[:train_size_seq], X_seq[train_size_seq:]
y_train_seq, y_test_seq = y_seq[:train_size_seq], y_seq[train_size_seq:]


class SeqDataset(Dataset):
    """Wraps (sequence, target) arrays for the LSTM."""

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


train_seq_dataset = SeqDataset(X_train_seq, y_train_seq)
test_seq_dataset = SeqDataset(X_test_seq, y_test_seq)
train_seq_loader = DataLoader(train_seq_dataset, batch_size=32, shuffle=False)
test_seq_loader = DataLoader(test_seq_dataset, batch_size=32, shuffle=False)


class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        # Use only the last timestep's hidden state for the prediction.
        return self.fc(out[:, -1, :])


hidden_size = 50
num_layers = 2
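# Illustrative shape check (assumes the GOOGL setup above produced data): each
# sample is a (seq_length, n_features) window, and its target is the scaled
# return one step past the window, so the LSTM always predicts strictly ahead
# of its inputs.
print(f"X_seq shape: {X_seq.shape}")  # expected (len(X_scaled) - 10, 10, 3)
print(f"y_seq shape: {y_seq.shape}")  # expected (len(X_scaled) - 10,)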
def run_models_for_ticker(ticker, seq_length=10):
    """Train RF, MLP, and LSTM models for one ticker and log them to MLflow."""
    with mlflow.start_run(run_name=f"{ticker}_models"):
        mlflow.log_param("ticker", ticker)
        mlflow.log_param("seq_length", seq_length)
        mlflow.log_param("test_split", 0.2)

        df_t = df_merged[df_merged['Ticker'] == ticker].copy()
        df_t = df_t.sort_values('Date')
        df_t['return_lag1'] = df_t['Return'].shift(1)
        df_t['volume_lag1'] = df_t['Volume'].shift(1)
        df_t = df_t.dropna()
        df_t['target_return'] = df_t['Return'].shift(-1)
        df_t = df_t.dropna()

        features_t = ['return_lag1', 'volume_lag1', 'sentiment_lag1']
        X_t = df_t[features_t].values
        y_t = df_t['target_return'].values

        train_size_t = int(len(X_t) * 0.8)
        if train_size_t == 0 or train_size_t == len(X_t):
            print(f"Not enough data for ticker {ticker} after splitting "
                  f"(train={train_size_t}, test={len(X_t) - train_size_t}). Skipping.")
            return None
        # Fit the scalers on the training split only, then transform the whole
        # series; fitting on all rows would leak test-set statistics.
        scaler_X_t = MinMaxScaler().fit(X_t[:train_size_t])
        scaler_y_t = MinMaxScaler().fit(y_t[:train_size_t].reshape(-1, 1))
        Xs = scaler_X_t.transform(X_t)
        ys = scaler_y_t.transform(y_t.reshape(-1, 1)).flatten()
        X_train_t, X_test_t = Xs[:train_size_t], Xs[train_size_t:]
        y_train_t, y_test_t = ys[:train_size_t], ys[train_size_t:]

        # Random forest on the flat lagged features. Metrics are computed in
        # scaled space, consistently across all three models.
        rf = RandomForestRegressor(n_estimators=200, random_state=42)
        rf.fit(X_train_t, y_train_t)
        y_rf_scaled = rf.predict(X_test_t)
        y_rf = scaler_y_t.inverse_transform(y_rf_scaled.reshape(-1, 1)).flatten()
        mse_rf = np.mean((y_test_t - y_rf_scaled) ** 2)
        mae_rf = np.mean(np.abs(y_test_t - y_rf_scaled))
        with mlflow.start_run(run_name=f"{ticker}_RandomForest", nested=True):
            mlflow.log_param("model_type", "RandomForest")
            mlflow.log_param("n_estimators", 200)
            mlflow.log_metric("mse", mse_rf)
            mlflow.log_metric("mae", mae_rf)
            mlflow.sklearn.log_model(rf, artifact_path=f"{ticker}_rf")
        print(f"{ticker} - RandomForest MSE: {mse_rf:.6f}, MAE: {mae_rf:.6f}")
        # Also log to the parent run for cross-model comparison.
        mlflow.log_metric(f"{ticker}_rf_mse", mse_rf)
        mlflow.log_metric(f"{ticker}_rf_mae", mae_rf)

        # MLP on the same flat features.
        train_ds = TimeSeriesDataset(X_train_t, y_train_t)
        test_ds = TimeSeriesDataset(X_test_t, y_test_t)
        train_loader_t = DataLoader(train_ds, batch_size=32, shuffle=False)
        test_loader_t = DataLoader(test_ds, batch_size=32, shuffle=False)
        mlp = MLPModel(input_size)
        train_model(mlp, train_loader_t)
        y_mlp_scaled = predict_model(mlp, test_loader_t)
        y_mlp = scaler_y_t.inverse_transform(y_mlp_scaled.reshape(-1, 1)).flatten()
        mse_mlp = np.mean((y_test_t - y_mlp_scaled) ** 2)
        mae_mlp = np.mean(np.abs(y_test_t - y_mlp_scaled))
        with mlflow.start_run(run_name=f"{ticker}_MLP", nested=True):
            mlflow.log_param("model_type", "MLP")
            mlflow.log_param("hidden_sizes", [50, 25])
            mlflow.log_metric("mse", mse_mlp)
            mlflow.log_metric("mae", mae_mlp)
            mlflow.pytorch.log_model(mlp, artifact_path=f"{ticker}_mlp")
        print(f"{ticker} - MLP MSE: {mse_mlp:.6f}, MAE: {mae_mlp:.6f}")
        mlflow.log_metric(f"{ticker}_mlp_mse", mse_mlp)
        mlflow.log_metric(f"{ticker}_mlp_mae", mae_mlp)
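        # Hedged aside (illustrative): a constant "predict the training mean"
        # baseline in the same scaled space, for context. Next-day returns are
        # notoriously noisy, so a model only adds value if its MSE beats this.
        baseline_pred = np.full_like(y_test_t, y_train_t.mean())
        mse_base = np.mean((y_test_t - baseline_pred) ** 2)
        print(f"{ticker} - Mean-baseline MSE (scaled): {mse_base:.6f}")
        mlflow.log_metric(f"{ticker}_baseline_mse", mse_base)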
        # LSTM on rolling windows of the scaled features.
        X_seq_t, y_seq_t = create_sequences(Xs, ys, seq_length)
        if len(X_seq_t) > 0:
            train_size_seq_t = int(len(X_seq_t) * 0.8)
            X_train_seq_t, X_test_seq_t = X_seq_t[:train_size_seq_t], X_seq_t[train_size_seq_t:]
            y_train_seq_t, y_test_seq_t = y_seq_t[:train_size_seq_t], y_seq_t[train_size_seq_t:]
            train_seq_ds_t = SeqDataset(X_train_seq_t, y_train_seq_t)
            test_seq_ds_t = SeqDataset(X_test_seq_t, y_test_seq_t)
            train_seq_loader_t = DataLoader(train_seq_ds_t, batch_size=32, shuffle=False)
            test_seq_loader_t = DataLoader(test_seq_ds_t, batch_size=32, shuffle=False)
            lstm = LSTMModel(input_size, hidden_size, num_layers)
            train_model(lstm, train_seq_loader_t)
            y_lstm_scaled = predict_model(lstm, test_seq_loader_t)
            y_lstm = scaler_y_t.inverse_transform(y_lstm_scaled.reshape(-1, 1)).flatten()
            mse_lstm = np.mean((y_test_seq_t - y_lstm_scaled) ** 2)
            mae_lstm = np.mean(np.abs(y_test_seq_t - y_lstm_scaled))
            with mlflow.start_run(run_name=f"{ticker}_LSTM", nested=True):
                mlflow.log_param("model_type", "LSTM")
                mlflow.log_param("hidden_size", hidden_size)
                mlflow.log_param("num_layers", num_layers)
                mlflow.log_metric("mse", mse_lstm)
                mlflow.log_metric("mae", mae_lstm)
                mlflow.pytorch.log_model(lstm, artifact_path=f"{ticker}_lstm")
            print(f"{ticker} - LSTM MSE: {mse_lstm:.6f}, MAE: {mae_lstm:.6f}")
            mlflow.log_metric(f"{ticker}_lstm_mse", mse_lstm)
            mlflow.log_metric(f"{ticker}_lstm_mae", mae_lstm)
        else:
            y_lstm = np.array([])

        dates_test = pd.to_datetime(
            df_t['Date'].iloc[train_size_t:train_size_t + len(y_test_t)].values)
        y_actual = scaler_y_t.inverse_transform(y_test_t.reshape(-1, 1)).flatten()
        return dates_test, y_actual, y_rf, y_mlp, y_lstm


tickers = ['AAPL', 'GOOGL', 'TSLA']
for t in tickers:
    try:
        res = run_models_for_ticker(t)
    except Exception as e:
        print(f"Error processing {t}: {e}")
        continue
    if res is None:
        continue
    dates_test, y_actual, y_rf, y_mlp, y_lstm = res

    plt.figure(figsize=(14, 6))
    plt.plot(dates_test, y_actual, label='Actual', color='black')
    if len(y_rf) > 0:
        plt.plot(dates_test, y_rf, label='RandomForest', color='blue')
    if len(y_mlp) > 0:
        plt.plot(dates_test, y_mlp, label='MLP', color='green')
    if len(y_lstm) > 0:
        # The LSTM test set is offset by seq_length, so this alignment with
        # the flat-feature test dates is approximate.
        plt.plot(dates_test[:len(y_lstm)], y_lstm, label='LSTM', color='red')
    plt.title(f"{t} Next Day Return Predictions (Sentiment-Aware)")
    plt.xlabel('Date')
    plt.ylabel('Return')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    out_file = f'model_predictions_{t}.png'
    plt.savefig(out_file)
    print(f"Saved plot for {t} to {out_file}")
    plt.show()
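# Optional follow-up (illustrative; the run ID below is a placeholder): models
# logged above can be reloaded from the ./mlruns store for later inference,
# e.g. the random forest saved under the "<ticker>_rf" artifact path.
# Substitute a real run ID from the MLflow UI before uncommenting.
# loaded_rf = mlflow.sklearn.load_model("runs:/<run_id>/GOOGL_rf")
# print(loaded_rf.predict(X_test[:5]))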