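"""Sentiment-aware next-day return models for AAPL, GOOGL and TSLA.

Pipeline: load wide-format OHLCV prices and raw text (Reddit, news, GNews),
score each document with a small word-count lexicon, merge daily mean
sentiment into the price panel, then train and compare a RandomForest, an
MLP and an LSTM per ticker, logging everything to a local MLflow store.
"""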
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import mlflow
import mlflow.sklearn
import mlflow.pytorch
import joblib
import os
mlflow.set_tracking_uri("./mlruns")
mlflow.set_experiment("Investor-Sentiment-Aware-Models")
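# File-based tracking: runs land in ./mlruns and can be browsed with `mlflow ui`
# from the project root; no tracking server is required.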
def load_stock_data():
    # Assumed wide layout (yfinance-style multi-ticker export): row 0 carries
    # the metric names, row 1 the header row used here. os.path.join keeps the
    # path portable instead of a Windows-only backslash literal.
    stock_path = os.path.join('data', 'stock_prices.csv')
    df_raw = pd.read_csv(stock_path, skiprows=1, header=0, na_values=[''])
    df_raw = df_raw.rename(columns={df_raw.columns[0]: 'Date'})
    first_row = pd.read_csv(stock_path, nrows=1, header=None).iloc[0]
metric_names = first_row[1:].tolist()
rename_dict = {col: metric for col, metric in zip(df_raw.columns[1:], metric_names)}
df_raw = df_raw.rename(columns=rename_dict)
df_long = pd.melt(df_raw, id_vars=['Date'], value_vars=df_raw.columns[1:], var_name='Metric', value_name='Value')
    # pd.melt stacks one source column at a time, so each block of len(df_raw)
    # rows belongs to a single column; np.repeat labels whole blocks, whereas
    # np.tile would cycle tickers row by row and mislabel them. Assumes columns
    # cycle ticker-fastest: AAPL, GOOGL, TSLA for each of the five metrics.
    df_long['Ticker'] = np.repeat(['AAPL', 'GOOGL', 'TSLA'] * 5, len(df_raw))
df_prices = df_long.pivot_table(index=['Date', 'Ticker'], columns='Metric', values='Value', aggfunc='first').reset_index()
df_prices['Date'] = pd.to_datetime(df_prices['Date'], errors='coerce')
numeric_cols = ['Close', 'High', 'Low', 'Open', 'Volume']
df_prices[numeric_cols] = df_prices[numeric_cols].astype(float)
df_prices['Return'] = df_prices.groupby('Ticker')['Close'].pct_change()
return df_prices
def load_text_data():
    df_reddit = pd.read_csv(os.path.join('data', 'reddit_data.csv'))
    df_news = pd.read_csv(os.path.join('data', 'news_articles.csv'))
    df_gnews = pd.read_csv(os.path.join('data', 'gnews_data.csv'))
    frames = []
    for df, src in [(df_reddit, 'reddit'), (df_news, 'news'), (df_gnews, 'gnews')]:
        # Work on the loop-local frame and collect the trimmed copy: rebinding
        # `df` alone would leave the original three frames untouched.
        df = df.rename(columns={'content': 'text'})
        df['source'] = src
        frames.append(df[['text', 'publishedAt', 'source']])
    df_text = pd.concat(frames, ignore_index=True)
df_text['text'] = df_text['text'].astype(str).str.lower()
df_text['text'] = df_text['text'].str.replace(r'http\S+|www\S+', '', regex=True)
df_text['text'] = df_text['text'].str.replace(r'[^a-zA-Z\s]', ' ', regex=True).str.replace(r'\s+', ' ', regex=True).str.strip()
df_text['date'] = pd.to_datetime(df_text['publishedAt'], errors='coerce').dt.date
df_text = df_text.dropna(subset=['date'])
return df_text
df_prices = load_stock_data()
df_text = load_text_data()
positive_words = ['good', 'buy', 'up', 'rise', 'gain', 'positive', 'bull', 'strong', 'profit', 'growth', 'high', 'best', 'win', 'success', 'pump', 'moon', 'rocket']
negative_words = ['bad', 'sell', 'down', 'fall', 'loss', 'negative', 'bear', 'weak', 'decline', 'low', 'worst', 'fail', 'crash', 'risk', 'dump', 'scam']
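# Lexicon sentiment: score = (pos - neg) / (pos + neg), in [-1, 1] by
# construction, with 0 for texts containing no lexicon word at all. This is a
# deliberately crude stand-in for a trained sentiment model.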
def simple_sentiment(text):
words = text.split()
pos_count = sum(1 for word in words if word in positive_words)
neg_count = sum(1 for word in words if word in negative_words)
total = pos_count + neg_count
if total == 0:
return 0
return (pos_count - neg_count) / total
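# e.g. simple_sentiment("strong profit growth") -> (3 - 0) / 3 = 1.0,
#      simple_sentiment("buy before the crash") -> (1 - 1) / 2 = 0.0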
df_text['sentiment'] = df_text['text'].apply(simple_sentiment)
daily_sent = df_text.groupby(['date', 'source'])['sentiment'].mean().reset_index()
daily_sent_total = daily_sent.groupby('date')['sentiment'].mean().reset_index()
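# Two-stage aggregation: average per (date, source) first, then across sources,
# so a high-volume source (e.g. Reddit) cannot dominate the daily signal.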
df_prices['date'] = df_prices['Date'].dt.date
daily_sent_total['date'] = pd.to_datetime(daily_sent_total['date']).dt.date
df_merged = df_prices.merge(daily_sent_total, on='date', how='left')
df_merged = df_merged.sort_values(['Ticker', 'Date']).reset_index(drop=True)
# Fill forward within each ticker so gaps carry the last known sentiment; days
# before the first observation default to neutral 0.
df_merged['sentiment'] = df_merged.groupby('Ticker')['sentiment'].ffill().fillna(0)
# The model sees only yesterday's sentiment; the first row per ticker gets a
# neutral 0 rather than borrowing a value from the future.
df_merged['sentiment_lag1'] = df_merged.groupby('Ticker')['sentiment'].shift(1).fillna(0)
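# Module-level walk-through for a single ticker (GOOGL). It mirrors the
# per-ticker pipeline in run_models_for_ticker() below and supplies the
# globals the models reuse: `input_size`, `seq_length`, the Dataset classes
# and the train/predict helpers.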
ticker = 'GOOGL'
df_ticker = df_merged[df_merged['Ticker'] == ticker].copy()
df_ticker = df_ticker.sort_values('Date')
df_ticker['return_lag1'] = df_ticker['Return'].shift(1)
df_ticker['volume_lag1'] = df_ticker['Volume'].shift(1)
df_ticker.dropna(inplace=True)
df_ticker['target_return'] = df_ticker['Return'].shift(-1)
df_ticker.dropna(inplace=True)
features = ['return_lag1', 'volume_lag1', 'sentiment_lag1']
X = df_ticker[features].values
y = df_ticker['target_return'].values
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1,1)).flatten()
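# Chronological 80/20 split: no shuffling, so the test set is strictly the
# most recent 20% of days. Note the scalers above are fit on the full series,
# which leaks test-set min/max into the scaling; fitting on the train slice
# only would be the stricter choice.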
train_size = int(len(X) * 0.8)
X_train, X_test = X_scaled[:train_size], X_scaled[train_size:]
y_train, y_test = y_scaled[:train_size], y_scaled[train_size:]
class TimeSeriesDataset(Dataset):
def __init__(self, X, y):
self.X = torch.tensor(X, dtype=torch.float32)
self.y = torch.tensor(y, dtype=torch.float32)
def __len__(self):
return len(self.X)
def __getitem__(self, idx):
return self.X[idx], self.y[idx]
train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
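# Shared training/inference helpers. shuffle=False keeps batches in temporal
# order; with Adam and a per-batch MSE loss the order affects only the
# trajectory of the updates, not correctness.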
def train_model(model, loader, epochs=50):
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
for epoch in range(epochs):
model.train()
for batch_x, batch_y in loader:
optimizer.zero_grad()
outputs = model(batch_x)
loss = criterion(outputs, batch_y.unsqueeze(1))
loss.backward()
optimizer.step()
def predict_model(model, loader):
model.eval()
preds = []
with torch.no_grad():
for batch_x, _ in loader:
outputs = model(batch_x)
            # squeeze(-1) keeps the output 1-D even when the final batch holds
            # a single sample; a bare .squeeze() would return a 0-d array there
            # and break preds.extend().
            preds.extend(outputs.squeeze(-1).numpy())
return np.array(preds)
input_size = X.shape[1]
class MLPModel(nn.Module):
def __init__(self, input_size):
super().__init__()
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, 25)
self.fc3 = nn.Linear(25, 1)
def forward(self, x):
x = torch.relu(self.fc1(x))
x = torch.relu(self.fc2(x))
x = self.fc3(x)
return x
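# The MLP treats each day as an independent tabular sample of the three lagged
# features; only the LSTM below sees any temporal context beyond lag 1.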
mlp_model = MLPModel(input_size)
def create_sequences(data_X, data_y, seq_length):
xs, ys = [], []
for i in range(len(data_X) - seq_length):
x = data_X[i:i+seq_length]
y = data_y[i+seq_length]
xs.append(x)
ys.append(y)
return np.array(xs), np.array(ys)
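# With seq_length=10 and N scaled rows this yields X_seq of shape
# (N - 10, 10, 3) and y_seq of shape (N - 10,): each window of 10 days
# predicts the (already next-day-shifted) target of the following row.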
seq_length = 10
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)
train_size_seq = int(len(X_seq) * 0.8)
X_train_seq, X_test_seq = X_seq[:train_size_seq], X_seq[train_size_seq:]
y_train_seq, y_test_seq = y_seq[:train_size_seq], y_seq[train_size_seq:]
# The sequence windows need no special handling: torch.tensor builds the 3-D
# tensors the same way it builds the 2-D ones, so reuse TimeSeriesDataset
# under a second name instead of duplicating the class.
SeqDataset = TimeSeriesDataset
train_seq_dataset = SeqDataset(X_train_seq, y_train_seq)
test_seq_dataset = SeqDataset(X_test_seq, y_test_seq)
train_seq_loader = DataLoader(train_seq_dataset, batch_size=32, shuffle=False)
test_seq_loader = DataLoader(test_seq_dataset, batch_size=32, shuffle=False)
class LSTMModel(nn.Module):
def __init__(self, input_size, hidden_size, num_layers):
super().__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.fc = nn.Linear(hidden_size, 1)
def forward(self, x):
out, _ = self.lstm(x)
out = self.fc(out[:, -1, :])
return out
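# Only the hidden state of the final time step feeds the linear head, i.e. a
# many-to-one setup for one-step-ahead prediction.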
hidden_size = 50
num_layers = 2
def run_models_for_ticker(ticker, seq_length=10):
with mlflow.start_run(run_name=f"{ticker}_models"):
mlflow.log_param("ticker", ticker)
mlflow.log_param("seq_length", seq_length)
mlflow.log_param("test_split", 0.2)
df_t = df_merged[df_merged['Ticker'] == ticker].copy()
df_t = df_t.sort_values('Date')
df_t['return_lag1'] = df_t['Return'].shift(1)
df_t['volume_lag1'] = df_t['Volume'].shift(1)
df_t = df_t.dropna()
df_t['target_return'] = df_t['Return'].shift(-1)
df_t = df_t.dropna()
features_t = ['return_lag1', 'volume_lag1', 'sentiment_lag1']
X_t = df_t[features_t].values
y_t = df_t['target_return'].values
scaler_X_t = MinMaxScaler()
scaler_y_t = MinMaxScaler()
Xs = scaler_X_t.fit_transform(X_t)
ys = scaler_y_t.fit_transform(y_t.reshape(-1, 1)).flatten()
train_size_t = int(len(Xs) * 0.8)
X_train_t, X_test_t = Xs[:train_size_t], Xs[train_size_t:]
y_train_t, y_test_t = ys[:train_size_t], ys[train_size_t:]
if len(X_train_t) == 0 or len(X_test_t) == 0:
print(f"Not enough data for ticker {ticker} after splitting (train={len(X_train_t)}, test={len(X_test_t)}). Skipping.")
return None
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train_t, y_train_t)
y_rf_scaled = rf.predict(X_test_t)
y_rf = scaler_y_t.inverse_transform(y_rf_scaled.reshape(-1, 1)).flatten()
        # All MSE/MAE below are computed in scaled target space so the three
        # models are directly comparable; inverse-transformed predictions are
        # kept for plotting only.
        mse_rf = np.mean((y_test_t - y_rf_scaled)**2)
        mae_rf = np.mean(np.abs(y_test_t - y_rf_scaled))
with mlflow.start_run(run_name=f"{ticker}_RandomForest", nested=True):
mlflow.log_param("model_type", "RandomForest")
mlflow.log_param("n_estimators", 200)
mlflow.log_metric("mse", mse_rf)
mlflow.log_metric("mae", mae_rf)
mlflow.sklearn.log_model(rf, artifact_path=f"{ticker}_rf")
print(f"{ticker} - RandomForest MSE: {mse_rf:.6f}, MAE: {mae_rf:.6f}")
mlflow.log_metric(f"{ticker}_rf_mse", mse_rf)
mlflow.log_metric(f"{ticker}_rf_mae", mae_rf)
train_ds = TimeSeriesDataset(X_train_t, y_train_t)
test_ds = TimeSeriesDataset(X_test_t, y_test_t)
train_loader_t = DataLoader(train_ds, batch_size=32, shuffle=False)
test_loader_t = DataLoader(test_ds, batch_size=32, shuffle=False)
mlp = MLPModel(input_size)
train_model(mlp, train_loader_t)
y_mlp_scaled = predict_model(mlp, test_loader_t)
y_mlp = scaler_y_t.inverse_transform(y_mlp_scaled.reshape(-1, 1)).flatten()
mse_mlp = np.mean((y_test_t - y_mlp_scaled)**2)
mae_mlp = np.mean(np.abs(y_test_t - y_mlp_scaled))
with mlflow.start_run(run_name=f"{ticker}_MLP", nested=True):
mlflow.log_param("model_type", "MLP")
mlflow.log_param("hidden_sizes", [50, 25])
mlflow.log_metric("mse", mse_mlp)
mlflow.log_metric("mae", mae_mlp)
mlflow.pytorch.log_model(mlp, artifact_path=f"{ticker}_mlp")
print(f"{ticker} - MLP MSE: {mse_mlp:.6f}, MAE: {mae_mlp:.6f}")
mlflow.log_metric(f"{ticker}_mlp_mse", mse_mlp)
mlflow.log_metric(f"{ticker}_mlp_mae", mae_mlp)
X_seq_t, y_seq_t = create_sequences(Xs, ys, seq_length)
if len(X_seq_t) > 0:
train_size_seq_t = int(len(X_seq_t) * 0.8)
X_train_seq_t, X_test_seq_t = X_seq_t[:train_size_seq_t], X_seq_t[train_size_seq_t:]
y_train_seq_t, y_test_seq_t = y_seq_t[:train_size_seq_t], y_seq_t[train_size_seq_t:]
train_seq_ds_t = SeqDataset(X_train_seq_t, y_train_seq_t)
test_seq_ds_t = SeqDataset(X_test_seq_t, y_test_seq_t)
train_seq_loader_t = DataLoader(train_seq_ds_t, batch_size=32, shuffle=False)
test_seq_loader_t = DataLoader(test_seq_ds_t, batch_size=32, shuffle=False)
lstm = LSTMModel(input_size, hidden_size, num_layers)
train_model(lstm, train_seq_loader_t)
y_lstm_scaled = predict_model(lstm, test_seq_loader_t)
y_lstm = scaler_y_t.inverse_transform(y_lstm_scaled.reshape(-1, 1)).flatten()
mse_lstm = np.mean((y_test_seq_t - y_lstm_scaled)**2)
mae_lstm = np.mean(np.abs(y_test_seq_t - y_lstm_scaled))
with mlflow.start_run(run_name=f"{ticker}_LSTM", nested=True):
mlflow.log_param("model_type", "LSTM")
mlflow.log_param("hidden_size", hidden_size)
mlflow.log_param("num_layers", num_layers)
mlflow.log_metric("mse", mse_lstm)
mlflow.log_metric("mae", mae_lstm)
mlflow.pytorch.log_model(lstm, artifact_path=f"{ticker}_lstm")
print(f"{ticker} - LSTM MSE: {mse_lstm:.6f}, MAE: {mae_lstm:.6f}")
mlflow.log_metric(f"{ticker}_lstm_mse", mse_lstm)
mlflow.log_metric(f"{ticker}_lstm_mae", mae_lstm)
else:
y_lstm = np.array([])
dates_test = df_t['Date'].iloc[train_size_t:train_size_t + len(y_test_t)].values
dates_test = pd.to_datetime(dates_test)
y_actual = scaler_y_t.inverse_transform(y_test_t.reshape(-1, 1)).flatten()
return dates_test, y_actual, y_rf, y_mlp, y_lstm
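# Driver loop: one parent MLflow run per ticker (with nested runs per model)
# and one comparison plot per ticker saved as model_predictions_<ticker>.png.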
tickers = ['AAPL', 'GOOGL', 'TSLA']
for t in tickers:
try:
res = run_models_for_ticker(t)
except Exception as e:
print(f"Error processing {t}: {e}")
continue
if res is None:
continue
dates_test, y_actual, y_rf, y_mlp, y_lstm = res
plt.figure(figsize=(14, 6))
plt.plot(dates_test, y_actual, label='Actual', color='black')
if len(y_rf) > 0:
plt.plot(dates_test, y_rf, label='RandomForest', color='blue')
if len(y_mlp) > 0:
plt.plot(dates_test, y_mlp, label='MLP', color='green')
    if len(y_lstm) > 0:
        # Approximate alignment: the LSTM's test windows start seq_length rows
        # later, so its predictions are drawn against the first matching dates.
        plt.plot(dates_test[:len(y_lstm)], y_lstm, label='LSTM', color='red')
plt.title(f"{t} Next Day Return Predictions (Sentiment-Aware)")
plt.xlabel('Date')
plt.ylabel('Return')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
out_file = f'model_predictions_{t}.png'
plt.savefig(out_file)
print(f"Saved plot for {t} to {out_file}")
plt.show()