# model_utils.py — data loading, windowing, and LSTM prediction helpers for newstock.
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import load_model
#─── Data loading ──────────────────────────────────────────────────────────────
def load_and_merge_data():
    """Load stock prices plus Reddit/Twitter sentiment and merge them on Date.

    Reads three CSVs from the working directory, averages each sentiment
    source per calendar day, left-joins both onto the stock rows, and fills
    days with no posts with a neutral sentiment of 0.

    Returns:
        pd.DataFrame: stock rows with added `reddit_sentiment` and
        `twitter_sentiment` columns.
    """
    def _daily_sentiment(path, out_col):
        # Collapse per-post sentiment into one mean value per day.
        frame = pd.read_csv(path)
        frame['Date'] = pd.to_datetime(frame['Date'])
        return (frame.groupby('Date')['sentiment']
                     .mean()
                     .reset_index()
                     .rename(columns={'sentiment': out_col}))

    stock = pd.read_csv('scraped_combined_stock_data.csv')
    stock['Date'] = pd.to_datetime(stock['Date'])
    # Coerce non-numeric closes to NaN, then drop those rows.
    stock['Close'] = pd.to_numeric(stock['Close'], errors='coerce')
    stock = stock.dropna(subset=['Close'])

    reddit = _daily_sentiment('cleaned_reddit_data.csv', 'reddit_sentiment')
    twitter = _daily_sentiment('cleaned_twitter_data.csv', 'twitter_sentiment')

    # Strip timezone info so all merge keys compare as naive timestamps.
    for frame in (stock, reddit, twitter):
        frame['Date'] = frame['Date'].dt.tz_localize(None)

    merged = (stock
              .merge(reddit, on='Date', how='left')
              .merge(twitter, on='Date', how='left'))
    # Days without posts get neutral sentiment instead of NaN.
    merged['reddit_sentiment'] = merged['reddit_sentiment'].fillna(0)
    merged['twitter_sentiment'] = merged['twitter_sentiment'].fillna(0)
    return merged
#─── Sequence creation ─────────────────────────────────────────────────────────
def create_sequences_multifeature(data: np.ndarray, time_steps: int = 60):
    """Slide a fixed-length window over `data` to build supervised pairs.

    Args:
        data: 2-D array of shape (n_rows, n_features); column 0 is the
            target (Close price).
        time_steps: window length for each input sequence.

    Returns:
        (X, y): X has shape (n_rows - time_steps, time_steps, n_features);
        y holds column 0 of the row immediately after each window.
    """
    n_windows = len(data) - time_steps
    windows = [data[start:start + time_steps] for start in range(n_windows)]
    targets = [data[start + time_steps, 0] for start in range(n_windows)]
    return np.array(windows), np.array(targets)
#─── Preprocessing for a single ticker ─────────────────────────────────────────
def prepare_input(df, ticker: str, time_steps: int = 60):
    """
    Build the most recent model input window for one ticker.

    Args:
        df: merged DataFrame with 'Stock_Type', 'Close',
            'reddit_sentiment', and 'twitter_sentiment' columns.
        ticker: value to match against df['Stock_Type'].
        time_steps: window length the model expects.

    Returns:
        last_sequence: np.array shaped (1, time_steps, 3)
        scaler: MinMaxScaler fitted on that ticker's data
        error: None or error message
    """
    ticker_rows = df.loc[df['Stock_Type'] == ticker]
    available = len(ticker_rows)
    if available < time_steps:
        return None, None, f"Not enough data for {ticker}: {available} rows"
    raw = ticker_rows[['Close', 'reddit_sentiment', 'twitter_sentiment']].values
    scaler = MinMaxScaler((0, 1))
    normalized = scaler.fit_transform(raw)
    # Keep only the trailing window and add a leading batch dimension.
    window = normalized[-time_steps:]
    return window.reshape(1, time_steps, window.shape[1]), scaler, None
#─── Model loading + prediction ────────────────────────────────────────────────
def get_model_path(ticker: str, with_sentiment: bool = True):
    """Return the expected .h5 filename for a ticker's saved LSTM model.

    Args:
        ticker: stock symbol embedded in the filename.
        with_sentiment: True selects the sentiment-augmented model file,
            False the price-only one.

    Returns:
        Filename such as 'lstm_AAPL_model_with_sentiment.h5'.
    """
    # NOTE: adjust this naming scheme if your saved model files differ.
    suffix = 'model_with_sentiment' if with_sentiment else 'model'
    return f"lstm_{ticker}_{suffix}.h5"
def predict_price(ticker: str, df, time_steps: int = 60, with_sentiment: bool = True):
    """
    Forecast the next closing price for `ticker`.

    Loads the matching .h5 LSTM model, scales the last `time_steps` rows of
    `df` for that ticker, runs a single-step forecast, and maps the scaled
    prediction back to price units.

    Args:
        ticker: stock symbol used in both df['Stock_Type'] and the model filename.
        df: merged DataFrame as produced by load_and_merge_data().
        time_steps: input window length the model expects.
        with_sentiment: which saved model variant to load (default: the
            sentiment-augmented one, matching the previous behavior).

    Returns:
        The denormalized predicted close price as a float.

    Raises:
        FileNotFoundError: if the model file is missing.
        ValueError: if the ticker has fewer than `time_steps` rows.
    """
    model_file = get_model_path(ticker, with_sentiment=with_sentiment)
    if not os.path.exists(model_file):
        raise FileNotFoundError(f"Model file '{model_file}' not found.")
    seq, scaler, err = prepare_input(df, ticker, time_steps)
    if err:
        raise ValueError(err)
    model = load_model(model_file)
    pred_scaled = model.predict(seq)  # assumed shape (1, 1) — single-step, single-target
    # The scaler was fitted on 3 features (Close + 2 sentiments); pad the two
    # sentiment columns with zeros so inverse_transform is applicable, then
    # keep only the Close dimension.
    padded = np.concatenate([pred_scaled, np.zeros((1, 2))], axis=1)
    return float(scaler.inverse_transform(padded)[0, 0])