Spaces:
Running
Running
Upload 2 files
Browse files- src/data_fetcher.py +67 -49
- src/processor.py +49 -23
src/data_fetcher.py
CHANGED
|
@@ -24,64 +24,82 @@ class DataFetcher:
|
|
| 24 |
|
| 25 |
def fetch_market_data(self, days=50):
|
| 26 |
"""
|
| 27 |
-
|
| 28 |
"""
|
| 29 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
-
try:
|
| 32 |
-
# 1. Setup Timestamps (Finnhub needs Unix seconds)
|
| 33 |
-
end_ts = int(time.time())
|
| 34 |
-
start_ts = int((datetime.now() - timedelta(days=days+10)).timestamp())
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
res = self.finnhub_client.stock_candles('SPY', 'D', start_ts, end_ts)
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
-
|
| 71 |
-
|
| 72 |
|
| 73 |
-
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
| 85 |
|
| 86 |
# 🛡️ STREAMLIT CACHE: Ignores '_self' so it doesn't try to hash the Finnhub client.
|
| 87 |
# ttl=3600 caches the news for 1 hour so repeated button clicks load instantly.
|
|
|
|
| 24 |
|
| 25 |
def fetch_market_data(self, days=50):
|
| 26 |
"""
|
| 27 |
+
Exclusively loads market data from backup to ensure 100% uptime for demo.
|
| 28 |
"""
|
| 29 |
+
print(f"📁 System: API bypassed. Loading localized market data...")
|
| 30 |
+
backup_path = "data/market_data_backup.csv"
|
| 31 |
+
|
| 32 |
+
if not os.path.exists(backup_path):
|
| 33 |
+
print(f"🚨 FATAL: {backup_path} not found!")
|
| 34 |
+
return pd.DataFrame()
|
| 35 |
+
|
| 36 |
+
df = pd.read_csv(backup_path, index_col=0, parse_dates=True)
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
+
return df.tail(days)
|
| 40 |
+
|
|
|
|
| 41 |
|
| 42 |
+
|
| 43 |
+
# def fetch_market_data(self, days=50):
|
| 44 |
+
# """
|
| 45 |
+
# Fetches market data using Finnhub (SPY as proxy) with a CSV fallback.
|
| 46 |
+
# """
|
| 47 |
+
# print(f"📡 Attempting to fetch last {days} days from Finnhub (using SPY proxy)...")
|
| 48 |
+
|
| 49 |
+
# try:
|
| 50 |
+
# # 1. Setup Timestamps (Finnhub needs Unix seconds)
|
| 51 |
+
# end_ts = int(time.time())
|
| 52 |
+
# start_ts = int((datetime.now() - timedelta(days=days+10)).timestamp())
|
| 53 |
+
|
| 54 |
+
# # 2. Fetch SPY (S&P 500 Proxy)
|
| 55 |
+
# # '1' means daily candles
|
| 56 |
+
# res = self.finnhub_client.stock_candles('SPY', 'D', start_ts, end_ts)
|
| 57 |
+
|
| 58 |
+
# if res.get('s') != 'ok':
|
| 59 |
+
# raise ValueError(f"Finnhub API returned status: {res.get('s')}")
|
| 60 |
+
|
| 61 |
+
# # Convert Finnhub response to DataFrame
|
| 62 |
+
# df = pd.DataFrame({
|
| 63 |
+
# 'Date': pd.to_datetime(res['t'], unit='s'),
|
| 64 |
+
# 'Close': res['c'],
|
| 65 |
+
# 'Open': res['o'],
|
| 66 |
+
# 'High': res['h'],
|
| 67 |
+
# 'Low': res['l'],
|
| 68 |
+
# 'Volume': res['v']
|
| 69 |
+
# }).set_index('Date')
|
| 70 |
+
|
| 71 |
+
# # 3. Handle VIX (Finnhub free tier often blocks ^VIX)
|
| 72 |
+
# # We attempt it, but if it fails, we merge from our backup data
|
| 73 |
+
# try:
|
| 74 |
+
# vix_res = self.finnhub_client.stock_candles('VIX', 'D', start_ts, end_ts)
|
| 75 |
+
# if vix_res.get('s') == 'ok':
|
| 76 |
+
# df['VIX'] = vix_res['c']
|
| 77 |
+
# else:
|
| 78 |
+
# raise Exception("VIX not available on API")
|
| 79 |
+
# except:
|
| 80 |
+
# print("⚠️ VIX not available on Finnhub. Pulling VIX from backup...")
|
| 81 |
+
# backup_df = pd.read_csv("data/market_data_backup.csv", index_col=0, parse_dates=True)
|
| 82 |
+
# # Reindex backup to match the dates we just got from the API
|
| 83 |
+
# df['VIX'] = backup_df['VIX'].reindex(df.index, method='ffill')
|
| 84 |
+
|
| 85 |
+
# # Final cleanup
|
| 86 |
+
# df = df.ffill().dropna()
|
| 87 |
|
| 88 |
+
# if df.empty:
|
| 89 |
+
# raise ValueError("Resulting DataFrame is empty.")
|
| 90 |
|
| 91 |
+
# return df
|
| 92 |
|
| 93 |
+
# except Exception as e:
|
| 94 |
+
# print(f"⚠️ Finnhub fetch failed ({e}). Loading full backup from data/ folder...")
|
| 95 |
+
# backup_path = "data/market_data_backup.csv"
|
| 96 |
|
| 97 |
+
# if not os.path.exists(backup_path):
|
| 98 |
+
# print(f"🚨 FATAL: {backup_path} not found!")
|
| 99 |
+
# return pd.DataFrame() # This will trigger your safety check in Processor
|
| 100 |
|
| 101 |
+
# df_backup = pd.read_csv(backup_path, index_col=0, parse_dates=True)
|
| 102 |
+
# return df_backup.tail(days)
|
| 103 |
|
| 104 |
# 🛡️ STREAMLIT CACHE: Ignores '_self' so it doesn't try to hash the Finnhub client.
|
| 105 |
# ttl=3600 caches the news for 1 hour so repeated button clicks load instantly.
|
src/processor.py
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import numpy as np
|
| 3 |
import torch
|
| 4 |
-
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
|
| 5 |
import joblib
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
class Processor:
|
| 8 |
def __init__(self, scaler_path="models/robust_scaler.pkl"):
|
|
@@ -10,14 +13,14 @@ class Processor:
|
|
| 10 |
self.device = 0 if torch.cuda.is_available() else -1
|
| 11 |
self.model_name = "ProsusAI/finbert"
|
| 12 |
|
| 13 |
-
# Load Scaler
|
| 14 |
try:
|
| 15 |
self.scaler = joblib.load(scaler_path)
|
| 16 |
print(f"✅ Scaler loaded from {scaler_path}")
|
| 17 |
except:
|
| 18 |
-
print("⚠️ Scaler not found
|
| 19 |
-
|
| 20 |
-
# Initialize FinBERT
|
| 21 |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
| 22 |
self.model = AutoModelForSequenceClassification.from_pretrained(
|
| 23 |
self.model_name, use_safetensors=True
|
|
@@ -29,35 +32,57 @@ class Processor:
|
|
| 29 |
device=self.device
|
| 30 |
)
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
def process(self, df_market, df_news):
|
| 33 |
"""
|
| 34 |
-
|
| 35 |
"""
|
| 36 |
-
# 1. Process Sentiment
|
| 37 |
df_sent, df_news_scored = self._generate_sentiment_profile(df_news)
|
| 38 |
|
| 39 |
-
# 2. Merge
|
| 40 |
df_features = self._engineer_14_features(df_market, df_sent)
|
| 41 |
|
| 42 |
-
# 3. Extract metadata for
|
| 43 |
latest_metrics = {
|
| 44 |
"Sent_Mean": df_features['Sent_Mean'].iloc[-1],
|
| 45 |
-
"News_Volume": np.exp(df_features['News_Volume'].iloc[-1]) - 1,
|
| 46 |
"Panic_Interaction": df_features['Sent_x_VIX'].iloc[-1],
|
| 47 |
"RSI": df_features['RSI'].iloc[-1] * 100
|
| 48 |
}
|
| 49 |
|
| 50 |
-
# 4.
|
| 51 |
final_window = df_features.tail(30).values
|
| 52 |
scaled_window = self.scaler.transform(final_window)
|
| 53 |
input_tensor = np.expand_dims(scaled_window, axis=0).astype('float32')
|
| 54 |
|
| 55 |
-
# FIX: We now return df_features so app.py can plot the historical 30-day sentiment
|
| 56 |
return input_tensor, latest_metrics, df_features, df_news_scored
|
| 57 |
|
| 58 |
def _generate_sentiment_profile(self, df_news):
|
| 59 |
print("🧠 Running FinBERT Batch Analysis...")
|
| 60 |
titles = df_news['Title'].astype(str).tolist()
|
|
|
|
|
|
|
| 61 |
results = self.sentiment_pipe(titles, batch_size=32, truncation=True)
|
| 62 |
|
| 63 |
scores = []
|
|
@@ -65,9 +90,7 @@ class Processor:
|
|
| 65 |
label, score = res['label'].lower(), res['score']
|
| 66 |
scores.append(score if label == 'positive' else -score if label == 'negative' else 0.0)
|
| 67 |
|
| 68 |
-
df_news['Score'] = scores
|
| 69 |
-
|
| 70 |
-
# Ensure dates match format for grouping
|
| 71 |
df_news['Date'] = pd.to_datetime(df_news['Date']).dt.date
|
| 72 |
grouped = df_news.groupby('Date')['Score']
|
| 73 |
|
|
@@ -78,14 +101,15 @@ class Processor:
|
|
| 78 |
'Net_Bull': grouped.apply(lambda x: x.sum() / (len(x) + 1))
|
| 79 |
}).fillna(0.0)
|
| 80 |
|
| 81 |
-
# Convert index back to datetime for merging
|
| 82 |
daily.index = pd.to_datetime(daily.index)
|
| 83 |
-
|
| 84 |
return daily, df_news
|
| 85 |
|
| 86 |
def _engineer_14_features(self, df, df_sent):
|
| 87 |
data = df.copy()
|
| 88 |
|
|
|
|
|
|
|
|
|
|
| 89 |
# --- QUANT BRANCH (7 Features) ---
|
| 90 |
tp = (data['High'] + data['Low'] + data['Close']) / 3
|
| 91 |
vwap = (tp * data['Volume']).rolling(20).sum() / (data['Volume'].rolling(20).sum() + 1e-9)
|
|
@@ -96,25 +120,27 @@ class Processor:
|
|
| 96 |
loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
|
| 97 |
data['RSI'] = (100 - (100 / (1 + (gain/(loss + 1e-9))))) / 100.0
|
| 98 |
|
| 99 |
-
ema_12
|
| 100 |
-
data['
|
|
|
|
|
|
|
| 101 |
|
| 102 |
data['VIX_Norm'] = data['VIX'] / 100.0
|
| 103 |
data['VIX_Change'] = data['VIX'].pct_change()
|
| 104 |
|
| 105 |
-
tr = pd.concat([data['High']-data['Low'],
|
| 106 |
-
|
|
|
|
| 107 |
data['ATR_Dist'] = np.tanh((data['Close'] - data['Close'].rolling(22).mean()) / (tr.rolling(14).mean() + 1e-9))
|
| 108 |
data['Realized_Vol'] = data['Close'].pct_change().rolling(10).std() * 10
|
| 109 |
|
| 110 |
# --- SENTIMENT BRANCH (7 Features) ---
|
| 111 |
-
# Ensure indices match for joining
|
| 112 |
data.index = pd.to_datetime(data.index)
|
| 113 |
data = data.join(df_sent, how='left').fillna(0.0)
|
| 114 |
|
| 115 |
data['Sent_Mean_Delta'] = data['Sent_Mean'].diff().fillna(0.0)
|
| 116 |
data['Sent_Mean_EMA'] = data['Sent_Mean'].ewm(span=3).mean()
|
| 117 |
-
data['Sent_x_VIX'] = data['Sent_Mean'] * data['VIX_Norm']
|
| 118 |
|
| 119 |
feature_cols = [
|
| 120 |
'VWAP_Dist', 'RSI', 'MACD_Hist', 'VIX_Norm', 'VIX_Change', 'ATR_Dist', 'Realized_Vol',
|
|
|
|
| 1 |
+
import os
|
| 2 |
import pandas as pd
|
| 3 |
import numpy as np
|
| 4 |
import torch
|
|
|
|
| 5 |
import joblib
|
| 6 |
+
import time
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
|
| 9 |
|
| 10 |
class Processor:
|
| 11 |
def __init__(self, scaler_path="models/robust_scaler.pkl"):
|
|
|
|
| 13 |
self.device = 0 if torch.cuda.is_available() else -1
|
| 14 |
self.model_name = "ProsusAI/finbert"
|
| 15 |
|
| 16 |
+
# 1. Load Scaler
|
| 17 |
try:
|
| 18 |
self.scaler = joblib.load(scaler_path)
|
| 19 |
print(f"✅ Scaler loaded from {scaler_path}")
|
| 20 |
except:
|
| 21 |
+
print("⚠️ Scaler not found in models/ folder.")
|
| 22 |
+
|
| 23 |
+
# 2. Initialize FinBERT
|
| 24 |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
| 25 |
self.model = AutoModelForSequenceClassification.from_pretrained(
|
| 26 |
self.model_name, use_safetensors=True
|
|
|
|
| 32 |
device=self.device
|
| 33 |
)
|
| 34 |
|
| 35 |
+
def fetch_market_data(self, days=60):
|
| 36 |
+
"""
|
| 37 |
+
Loads market data from your provided CSV backup.
|
| 38 |
+
Bypasses Finnhub to avoid 403 errors during presentation.
|
| 39 |
+
"""
|
| 40 |
+
print(f"📁 System: Bypassing API. Loading local market data...")
|
| 41 |
+
backup_path = "data/market_data_backup.csv"
|
| 42 |
+
|
| 43 |
+
if not os.path.exists(backup_path):
|
| 44 |
+
print(f"🚨 FATAL: {backup_path} not found!")
|
| 45 |
+
return pd.DataFrame()
|
| 46 |
+
|
| 47 |
+
df = pd.read_csv(backup_path, index_col=0, parse_dates=True)
|
| 48 |
+
|
| 49 |
+
# Optional: Sync dates to today for presentation realism
|
| 50 |
+
# last_date = df.index[-1]
|
| 51 |
+
# offset = pd.Timestamp(datetime.now().date()) - last_date
|
| 52 |
+
# df.index = df.index + offset
|
| 53 |
+
|
| 54 |
+
return df.tail(days)
|
| 55 |
+
|
| 56 |
def process(self, df_market, df_news):
|
| 57 |
"""
|
| 58 |
+
Main pipeline: News Sentiment -> Feature Engineering -> GRU Input
|
| 59 |
"""
|
| 60 |
+
# 1. Process Sentiment from headlines
|
| 61 |
df_sent, df_news_scored = self._generate_sentiment_profile(df_news)
|
| 62 |
|
| 63 |
+
# 2. Merge and engineer all 14 features
|
| 64 |
df_features = self._engineer_14_features(df_market, df_sent)
|
| 65 |
|
| 66 |
+
# 3. Extract metadata for Streamlit UI
|
| 67 |
latest_metrics = {
|
| 68 |
"Sent_Mean": df_features['Sent_Mean'].iloc[-1],
|
| 69 |
+
"News_Volume": int(np.exp(df_features['News_Volume'].iloc[-1]) - 1),
|
| 70 |
"Panic_Interaction": df_features['Sent_x_VIX'].iloc[-1],
|
| 71 |
"RSI": df_features['RSI'].iloc[-1] * 100
|
| 72 |
}
|
| 73 |
|
| 74 |
+
# 4. Prepare 30-day window for GRU
|
| 75 |
final_window = df_features.tail(30).values
|
| 76 |
scaled_window = self.scaler.transform(final_window)
|
| 77 |
input_tensor = np.expand_dims(scaled_window, axis=0).astype('float32')
|
| 78 |
|
|
|
|
| 79 |
return input_tensor, latest_metrics, df_features, df_news_scored
|
| 80 |
|
| 81 |
def _generate_sentiment_profile(self, df_news):
|
| 82 |
print("🧠 Running FinBERT Batch Analysis...")
|
| 83 |
titles = df_news['Title'].astype(str).tolist()
|
| 84 |
+
|
| 85 |
+
# Batch processing to handle 1700+ headlines efficiently
|
| 86 |
results = self.sentiment_pipe(titles, batch_size=32, truncation=True)
|
| 87 |
|
| 88 |
scores = []
|
|
|
|
| 90 |
label, score = res['label'].lower(), res['score']
|
| 91 |
scores.append(score if label == 'positive' else -score if label == 'negative' else 0.0)
|
| 92 |
|
| 93 |
+
df_news['Score'] = scores
|
|
|
|
|
|
|
| 94 |
df_news['Date'] = pd.to_datetime(df_news['Date']).dt.date
|
| 95 |
grouped = df_news.groupby('Date')['Score']
|
| 96 |
|
|
|
|
| 101 |
'Net_Bull': grouped.apply(lambda x: x.sum() / (len(x) + 1))
|
| 102 |
}).fillna(0.0)
|
| 103 |
|
|
|
|
| 104 |
daily.index = pd.to_datetime(daily.index)
|
|
|
|
| 105 |
return daily, df_news
|
| 106 |
|
| 107 |
def _engineer_14_features(self, df, df_sent):
|
| 108 |
data = df.copy()
|
| 109 |
|
| 110 |
+
data.columns = [c.capitalize() for c in data.columns]
|
| 111 |
+
if 'Vix' in data.columns: data = data.rename(columns={'Vix': 'VIX'})
|
| 112 |
+
|
| 113 |
# --- QUANT BRANCH (7 Features) ---
|
| 114 |
tp = (data['High'] + data['Low'] + data['Close']) / 3
|
| 115 |
vwap = (tp * data['Volume']).rolling(20).sum() / (data['Volume'].rolling(20).sum() + 1e-9)
|
|
|
|
| 120 |
loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
|
| 121 |
data['RSI'] = (100 - (100 / (1 + (gain/(loss + 1e-9))))) / 100.0
|
| 122 |
|
| 123 |
+
ema_12 = data['Close'].ewm(span=12).mean()
|
| 124 |
+
ema_26 = data['Close'].ewm(span=26).mean()
|
| 125 |
+
macd = ema_12 - ema_26
|
| 126 |
+
data['MACD_Hist'] = (macd - macd.ewm(span=9).mean()) / data['Close']
|
| 127 |
|
| 128 |
data['VIX_Norm'] = data['VIX'] / 100.0
|
| 129 |
data['VIX_Change'] = data['VIX'].pct_change()
|
| 130 |
|
| 131 |
+
tr = pd.concat([data['High']-data['Low'],
|
| 132 |
+
abs(data['High']-data['Close'].shift()),
|
| 133 |
+
abs(data['Low']-data['Close'].shift())], axis=1).max(axis=1)
|
| 134 |
data['ATR_Dist'] = np.tanh((data['Close'] - data['Close'].rolling(22).mean()) / (tr.rolling(14).mean() + 1e-9))
|
| 135 |
data['Realized_Vol'] = data['Close'].pct_change().rolling(10).std() * 10
|
| 136 |
|
| 137 |
# --- SENTIMENT BRANCH (7 Features) ---
|
|
|
|
| 138 |
data.index = pd.to_datetime(data.index)
|
| 139 |
data = data.join(df_sent, how='left').fillna(0.0)
|
| 140 |
|
| 141 |
data['Sent_Mean_Delta'] = data['Sent_Mean'].diff().fillna(0.0)
|
| 142 |
data['Sent_Mean_EMA'] = data['Sent_Mean'].ewm(span=3).mean()
|
| 143 |
+
data['Sent_x_VIX'] = data['Sent_Mean'] * data['VIX_Norm']
|
| 144 |
|
| 145 |
feature_cols = [
|
| 146 |
'VWAP_Dist', 'RSI', 'MACD_Hist', 'VIX_Norm', 'VIX_Change', 'ATR_Dist', 'Realized_Vol',
|