DevKX committed on
Commit
2d5897e
·
verified ·
1 Parent(s): 2027a5d

Upload 2 files

Browse files
Files changed (2) hide show
  1. src/data_fetcher.py +67 -49
  2. src/processor.py +49 -23
src/data_fetcher.py CHANGED
@@ -24,64 +24,82 @@ class DataFetcher:
24
 
25
  def fetch_market_data(self, days=50):
26
  """
27
- Fetches market data using Finnhub (SPY as proxy) with a CSV fallback.
28
  """
29
- print(f"📡 Attempting to fetch last {days} days from Finnhub (using SPY proxy)...")
 
 
 
 
 
 
 
30
 
31
- try:
32
- # 1. Setup Timestamps (Finnhub needs Unix seconds)
33
- end_ts = int(time.time())
34
- start_ts = int((datetime.now() - timedelta(days=days+10)).timestamp())
35
 
36
- # 2. Fetch SPY (S&P 500 Proxy)
37
- # '1' means daily candles
38
- res = self.finnhub_client.stock_candles('SPY', 'D', start_ts, end_ts)
39
 
40
- if res.get('s') != 'ok':
41
- raise ValueError(f"Finnhub API returned status: {res.get('s')}")
42
-
43
- # Convert Finnhub response to DataFrame
44
- df = pd.DataFrame({
45
- 'Date': pd.to_datetime(res['t'], unit='s'),
46
- 'Close': res['c'],
47
- 'Open': res['o'],
48
- 'High': res['h'],
49
- 'Low': res['l'],
50
- 'Volume': res['v']
51
- }).set_index('Date')
52
-
53
- # 3. Handle VIX (Finnhub free tier often blocks ^VIX)
54
- # We attempt it, but if it fails, we merge from our backup data
55
- try:
56
- vix_res = self.finnhub_client.stock_candles('VIX', 'D', start_ts, end_ts)
57
- if vix_res.get('s') == 'ok':
58
- df['VIX'] = vix_res['c']
59
- else:
60
- raise Exception("VIX not available on API")
61
- except:
62
- print("⚠️ VIX not available on Finnhub. Pulling VIX from backup...")
63
- backup_df = pd.read_csv("data/market_data_backup.csv", index_col=0, parse_dates=True)
64
- # Reindex backup to match the dates we just got from the API
65
- df['VIX'] = backup_df['VIX'].reindex(df.index, method='ffill')
66
-
67
- # Final cleanup
68
- df = df.ffill().dropna()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- if df.empty:
71
- raise ValueError("Resulting DataFrame is empty.")
72
 
73
- return df
74
 
75
- except Exception as e:
76
- print(f"⚠️ Finnhub fetch failed ({e}). Loading full backup from data/ folder...")
77
- backup_path = "data/market_data_backup.csv"
78
 
79
- if not os.path.exists(backup_path):
80
- print(f"🚨 FATAL: {backup_path} not found!")
81
- return pd.DataFrame() # This will trigger your safety check in Processor
82
 
83
- df_backup = pd.read_csv(backup_path, index_col=0, parse_dates=True)
84
- return df_backup.tail(days)
85
 
86
  # 🛡️ STREAMLIT CACHE: Ignores '_self' so it doesn't try to hash the Finnhub client.
87
  # ttl=3600 caches the news for 1 hour so repeated button clicks load instantly.
 
24
 
25
  def fetch_market_data(self, days=50):
26
  """
27
+ Exclusively loads market data from backup to ensure 100% uptime for demo.
28
  """
29
+ print(f"📁 System: API bypassed. Loading localized market data...")
30
+ backup_path = "data/market_data_backup.csv"
31
+
32
+ if not os.path.exists(backup_path):
33
+ print(f"🚨 FATAL: {backup_path} not found!")
34
+ return pd.DataFrame()
35
+
36
+ df = pd.read_csv(backup_path, index_col=0, parse_dates=True)
37
 
 
 
 
 
38
 
39
+ return df.tail(days)
40
+
 
41
 
42
+
43
+ # def fetch_market_data(self, days=50):
44
+ # """
45
+ # Fetches market data using Finnhub (SPY as proxy) with a CSV fallback.
46
+ # """
47
+ # print(f"📡 Attempting to fetch last {days} days from Finnhub (using SPY proxy)...")
48
+
49
+ # try:
50
+ # # 1. Setup Timestamps (Finnhub needs Unix seconds)
51
+ # end_ts = int(time.time())
52
+ # start_ts = int((datetime.now() - timedelta(days=days+10)).timestamp())
53
+
54
+ # # 2. Fetch SPY (S&P 500 Proxy)
55
+ # # '1' means daily candles
56
+ # res = self.finnhub_client.stock_candles('SPY', 'D', start_ts, end_ts)
57
+
58
+ # if res.get('s') != 'ok':
59
+ # raise ValueError(f"Finnhub API returned status: {res.get('s')}")
60
+
61
+ # # Convert Finnhub response to DataFrame
62
+ # df = pd.DataFrame({
63
+ # 'Date': pd.to_datetime(res['t'], unit='s'),
64
+ # 'Close': res['c'],
65
+ # 'Open': res['o'],
66
+ # 'High': res['h'],
67
+ # 'Low': res['l'],
68
+ # 'Volume': res['v']
69
+ # }).set_index('Date')
70
+
71
+ # # 3. Handle VIX (Finnhub free tier often blocks ^VIX)
72
+ # # We attempt it, but if it fails, we merge from our backup data
73
+ # try:
74
+ # vix_res = self.finnhub_client.stock_candles('VIX', 'D', start_ts, end_ts)
75
+ # if vix_res.get('s') == 'ok':
76
+ # df['VIX'] = vix_res['c']
77
+ # else:
78
+ # raise Exception("VIX not available on API")
79
+ # except:
80
+ # print("⚠️ VIX not available on Finnhub. Pulling VIX from backup...")
81
+ # backup_df = pd.read_csv("data/market_data_backup.csv", index_col=0, parse_dates=True)
82
+ # # Reindex backup to match the dates we just got from the API
83
+ # df['VIX'] = backup_df['VIX'].reindex(df.index, method='ffill')
84
+
85
+ # # Final cleanup
86
+ # df = df.ffill().dropna()
87
 
88
+ # if df.empty:
89
+ # raise ValueError("Resulting DataFrame is empty.")
90
 
91
+ # return df
92
 
93
+ # except Exception as e:
94
+ # print(f"⚠️ Finnhub fetch failed ({e}). Loading full backup from data/ folder...")
95
+ # backup_path = "data/market_data_backup.csv"
96
 
97
+ # if not os.path.exists(backup_path):
98
+ # print(f"🚨 FATAL: {backup_path} not found!")
99
+ # return pd.DataFrame() # This will trigger your safety check in Processor
100
 
101
+ # df_backup = pd.read_csv(backup_path, index_col=0, parse_dates=True)
102
+ # return df_backup.tail(days)
103
 
104
  # 🛡️ STREAMLIT CACHE: Ignores '_self' so it doesn't try to hash the Finnhub client.
105
  # ttl=3600 caches the news for 1 hour so repeated button clicks load instantly.
src/processor.py CHANGED
@@ -1,8 +1,11 @@
 
1
  import pandas as pd
2
  import numpy as np
3
  import torch
4
- from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
5
  import joblib
 
 
 
6
 
7
  class Processor:
8
  def __init__(self, scaler_path="models/robust_scaler.pkl"):
@@ -10,14 +13,14 @@ class Processor:
10
  self.device = 0 if torch.cuda.is_available() else -1
11
  self.model_name = "ProsusAI/finbert"
12
 
13
- # Load Scaler (Required for normalization before GRU)
14
  try:
15
  self.scaler = joblib.load(scaler_path)
16
  print(f"✅ Scaler loaded from {scaler_path}")
17
  except:
18
- print("⚠️ Scaler not found. Ensure robust_scaler.pkl is in models/ folder.")
19
-
20
- # Initialize FinBERT Pipeline
21
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
22
  self.model = AutoModelForSequenceClassification.from_pretrained(
23
  self.model_name, use_safetensors=True
@@ -29,35 +32,57 @@ class Processor:
29
  device=self.device
30
  )
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def process(self, df_market, df_news):
33
  """
34
- Modified to return metadata and historical features for Streamlit display.
35
  """
36
- # 1. Process Sentiment features from headlines
37
  df_sent, df_news_scored = self._generate_sentiment_profile(df_news)
38
 
39
- # 2. Merge with market data and engineer all 14 features
40
  df_features = self._engineer_14_features(df_market, df_sent)
41
 
42
- # 3. Extract metadata for the UI (latest day's values)
43
  latest_metrics = {
44
  "Sent_Mean": df_features['Sent_Mean'].iloc[-1],
45
- "News_Volume": np.exp(df_features['News_Volume'].iloc[-1]) - 1, # Reverse log
46
  "Panic_Interaction": df_features['Sent_x_VIX'].iloc[-1],
47
  "RSI": df_features['RSI'].iloc[-1] * 100
48
  }
49
 
50
- # 4. Get the last 30 days and scale for the Model
51
  final_window = df_features.tail(30).values
52
  scaled_window = self.scaler.transform(final_window)
53
  input_tensor = np.expand_dims(scaled_window, axis=0).astype('float32')
54
 
55
- # FIX: We now return df_features so app.py can plot the historical 30-day sentiment
56
  return input_tensor, latest_metrics, df_features, df_news_scored
57
 
58
  def _generate_sentiment_profile(self, df_news):
59
  print("🧠 Running FinBERT Batch Analysis...")
60
  titles = df_news['Title'].astype(str).tolist()
 
 
61
  results = self.sentiment_pipe(titles, batch_size=32, truncation=True)
62
 
63
  scores = []
@@ -65,9 +90,7 @@ class Processor:
65
  label, score = res['label'].lower(), res['score']
66
  scores.append(score if label == 'positive' else -score if label == 'negative' else 0.0)
67
 
68
- df_news['Score'] = scores # Add scores to the raw news df
69
-
70
- # Ensure dates match format for grouping
71
  df_news['Date'] = pd.to_datetime(df_news['Date']).dt.date
72
  grouped = df_news.groupby('Date')['Score']
73
 
@@ -78,14 +101,15 @@ class Processor:
78
  'Net_Bull': grouped.apply(lambda x: x.sum() / (len(x) + 1))
79
  }).fillna(0.0)
80
 
81
- # Convert index back to datetime for merging
82
  daily.index = pd.to_datetime(daily.index)
83
-
84
  return daily, df_news
85
 
86
  def _engineer_14_features(self, df, df_sent):
87
  data = df.copy()
88
 
 
 
 
89
  # --- QUANT BRANCH (7 Features) ---
90
  tp = (data['High'] + data['Low'] + data['Close']) / 3
91
  vwap = (tp * data['Volume']).rolling(20).sum() / (data['Volume'].rolling(20).sum() + 1e-9)
@@ -96,25 +120,27 @@ class Processor:
96
  loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
97
  data['RSI'] = (100 - (100 / (1 + (gain/(loss + 1e-9))))) / 100.0
98
 
99
- ema_12, ema_26 = data['Close'].ewm(span=12).mean(), data['Close'].ewm(span=26).mean()
100
- data['MACD_Hist'] = ((ema_12 - ema_26) - (ema_12 - ema_26).ewm(span=9).mean()) / data['Close']
 
 
101
 
102
  data['VIX_Norm'] = data['VIX'] / 100.0
103
  data['VIX_Change'] = data['VIX'].pct_change()
104
 
105
- tr = pd.concat([data['High']-data['Low'], abs(data['High']-data['Close'].shift()),
106
- abs(data['Low']-data['Close'].shift())], axis=1).max(axis=1)
 
107
  data['ATR_Dist'] = np.tanh((data['Close'] - data['Close'].rolling(22).mean()) / (tr.rolling(14).mean() + 1e-9))
108
  data['Realized_Vol'] = data['Close'].pct_change().rolling(10).std() * 10
109
 
110
  # --- SENTIMENT BRANCH (7 Features) ---
111
- # Ensure indices match for joining
112
  data.index = pd.to_datetime(data.index)
113
  data = data.join(df_sent, how='left').fillna(0.0)
114
 
115
  data['Sent_Mean_Delta'] = data['Sent_Mean'].diff().fillna(0.0)
116
  data['Sent_Mean_EMA'] = data['Sent_Mean'].ewm(span=3).mean()
117
- data['Sent_x_VIX'] = data['Sent_Mean'] * data['VIX_Norm'] # Panic Interaction
118
 
119
  feature_cols = [
120
  'VWAP_Dist', 'RSI', 'MACD_Hist', 'VIX_Norm', 'VIX_Change', 'ATR_Dist', 'Realized_Vol',
 
1
+ import os
2
  import pandas as pd
3
  import numpy as np
4
  import torch
 
5
  import joblib
6
+ import time
7
+ from datetime import datetime
8
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
9
 
10
  class Processor:
11
  def __init__(self, scaler_path="models/robust_scaler.pkl"):
 
13
  self.device = 0 if torch.cuda.is_available() else -1
14
  self.model_name = "ProsusAI/finbert"
15
 
16
+ # 1. Load Scaler
17
  try:
18
  self.scaler = joblib.load(scaler_path)
19
  print(f"✅ Scaler loaded from {scaler_path}")
20
  except:
21
+ print("⚠️ Scaler not found in models/ folder.")
22
+
23
+ # 2. Initialize FinBERT
24
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
25
  self.model = AutoModelForSequenceClassification.from_pretrained(
26
  self.model_name, use_safetensors=True
 
32
  device=self.device
33
  )
34
 
35
+ def fetch_market_data(self, days=60):
36
+ """
37
+ Loads market data from your provided CSV backup.
38
+ Bypasses Finnhub to avoid 403 errors during presentation.
39
+ """
40
+ print(f"📁 System: Bypassing API. Loading local market data...")
41
+ backup_path = "data/market_data_backup.csv"
42
+
43
+ if not os.path.exists(backup_path):
44
+ print(f"🚨 FATAL: {backup_path} not found!")
45
+ return pd.DataFrame()
46
+
47
+ df = pd.read_csv(backup_path, index_col=0, parse_dates=True)
48
+
49
+ # Optional: Sync dates to today for presentation realism
50
+ # last_date = df.index[-1]
51
+ # offset = pd.Timestamp(datetime.now().date()) - last_date
52
+ # df.index = df.index + offset
53
+
54
+ return df.tail(days)
55
+
56
  def process(self, df_market, df_news):
57
  """
58
+ Main pipeline: News Sentiment -> Feature Engineering -> GRU Input
59
  """
60
+ # 1. Process Sentiment from headlines
61
  df_sent, df_news_scored = self._generate_sentiment_profile(df_news)
62
 
63
+ # 2. Merge and engineer all 14 features
64
  df_features = self._engineer_14_features(df_market, df_sent)
65
 
66
+ # 3. Extract metadata for Streamlit UI
67
  latest_metrics = {
68
  "Sent_Mean": df_features['Sent_Mean'].iloc[-1],
69
+ "News_Volume": int(np.exp(df_features['News_Volume'].iloc[-1]) - 1),
70
  "Panic_Interaction": df_features['Sent_x_VIX'].iloc[-1],
71
  "RSI": df_features['RSI'].iloc[-1] * 100
72
  }
73
 
74
+ # 4. Prepare 30-day window for GRU
75
  final_window = df_features.tail(30).values
76
  scaled_window = self.scaler.transform(final_window)
77
  input_tensor = np.expand_dims(scaled_window, axis=0).astype('float32')
78
 
 
79
  return input_tensor, latest_metrics, df_features, df_news_scored
80
 
81
  def _generate_sentiment_profile(self, df_news):
82
  print("🧠 Running FinBERT Batch Analysis...")
83
  titles = df_news['Title'].astype(str).tolist()
84
+
85
+ # Batch processing to handle 1700+ headlines efficiently
86
  results = self.sentiment_pipe(titles, batch_size=32, truncation=True)
87
 
88
  scores = []
 
90
  label, score = res['label'].lower(), res['score']
91
  scores.append(score if label == 'positive' else -score if label == 'negative' else 0.0)
92
 
93
+ df_news['Score'] = scores
 
 
94
  df_news['Date'] = pd.to_datetime(df_news['Date']).dt.date
95
  grouped = df_news.groupby('Date')['Score']
96
 
 
101
  'Net_Bull': grouped.apply(lambda x: x.sum() / (len(x) + 1))
102
  }).fillna(0.0)
103
 
 
104
  daily.index = pd.to_datetime(daily.index)
 
105
  return daily, df_news
106
 
107
  def _engineer_14_features(self, df, df_sent):
108
  data = df.copy()
109
 
110
+ data.columns = [c.capitalize() for c in data.columns]
111
+ if 'Vix' in data.columns: data = data.rename(columns={'Vix': 'VIX'})
112
+
113
  # --- QUANT BRANCH (7 Features) ---
114
  tp = (data['High'] + data['Low'] + data['Close']) / 3
115
  vwap = (tp * data['Volume']).rolling(20).sum() / (data['Volume'].rolling(20).sum() + 1e-9)
 
120
  loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
121
  data['RSI'] = (100 - (100 / (1 + (gain/(loss + 1e-9))))) / 100.0
122
 
123
+ ema_12 = data['Close'].ewm(span=12).mean()
124
+ ema_26 = data['Close'].ewm(span=26).mean()
125
+ macd = ema_12 - ema_26
126
+ data['MACD_Hist'] = (macd - macd.ewm(span=9).mean()) / data['Close']
127
 
128
  data['VIX_Norm'] = data['VIX'] / 100.0
129
  data['VIX_Change'] = data['VIX'].pct_change()
130
 
131
+ tr = pd.concat([data['High']-data['Low'],
132
+ abs(data['High']-data['Close'].shift()),
133
+ abs(data['Low']-data['Close'].shift())], axis=1).max(axis=1)
134
  data['ATR_Dist'] = np.tanh((data['Close'] - data['Close'].rolling(22).mean()) / (tr.rolling(14).mean() + 1e-9))
135
  data['Realized_Vol'] = data['Close'].pct_change().rolling(10).std() * 10
136
 
137
  # --- SENTIMENT BRANCH (7 Features) ---
 
138
  data.index = pd.to_datetime(data.index)
139
  data = data.join(df_sent, how='left').fillna(0.0)
140
 
141
  data['Sent_Mean_Delta'] = data['Sent_Mean'].diff().fillna(0.0)
142
  data['Sent_Mean_EMA'] = data['Sent_Mean'].ewm(span=3).mean()
143
+ data['Sent_x_VIX'] = data['Sent_Mean'] * data['VIX_Norm']
144
 
145
  feature_cols = [
146
  'VWAP_Dist', 'RSI', 'MACD_Hist', 'VIX_Norm', 'VIX_Change', 'ATR_Dist', 'Realized_Vol',