Spaces:

DevKX
/

alpha-predict

Sleeping

App Files Files Community

DevKX committed on Feb 25

Commit

2d5897e

verified ·

1 Parent(s): 2027a5d

Upload 2 files

Browse files

Files changed (2) hide show

src/data_fetcher.py +67 -49
src/processor.py +49 -23

src/data_fetcher.py CHANGED Viewed

@@ -24,64 +24,82 @@ class DataFetcher:
     def fetch_market_data(self, days=50):
         """
-        Fetches market data using Finnhub (SPY as proxy) with a CSV fallback.
         """
-        print(f"📡 Attempting to fetch last {days} days from Finnhub (using SPY proxy)...")
-        try:
-            # 1. Setup Timestamps (Finnhub needs Unix seconds)
-            end_ts = int(time.time())
-            start_ts = int((datetime.now() - timedelta(days=days+10)).timestamp())
-            # 2. Fetch SPY (S&P 500 Proxy)
-            # 'D' means daily candles
-            res = self.finnhub_client.stock_candles('SPY', 'D', start_ts, end_ts)
-            if res.get('s') != 'ok':
-                raise ValueError(f"Finnhub API returned status: {res.get('s')}")
-            # Convert Finnhub response to DataFrame
-            df = pd.DataFrame({
-                'Date': pd.to_datetime(res['t'], unit='s'),
-                'Close': res['c'],
-                'Open': res['o'],
-                'High': res['h'],
-                'Low': res['l'],
-                'Volume': res['v']
-            }).set_index('Date')
-            # 3. Handle VIX (Finnhub free tier often blocks ^VIX)
-            # We attempt it, but if it fails, we merge from our backup data
-            try:
-                vix_res = self.finnhub_client.stock_candles('VIX', 'D', start_ts, end_ts)
-                if vix_res.get('s') == 'ok':
-                    df['VIX'] = vix_res['c']
-                else:
-                    raise Exception("VIX not available on API")
-            except:
-                print("⚠️ VIX not available on Finnhub. Pulling VIX from backup...")
-                backup_df = pd.read_csv("data/market_data_backup.csv", index_col=0, parse_dates=True)
-                # Reindex backup to match the dates we just got from the API
-                df['VIX'] = backup_df['VIX'].reindex(df.index, method='ffill')
-            # Final cleanup
-            df = df.ffill().dropna()
-            if df.empty:
-                raise ValueError("Resulting DataFrame is empty.")
-            return df
-        except Exception as e:
-            print(f"⚠️ Finnhub fetch failed ({e}). Loading full backup from data/ folder...")
-            backup_path = "data/market_data_backup.csv"
-            if not os.path.exists(backup_path):
-                print(f"🚨 FATAL: {backup_path} not found!")
-                return pd.DataFrame() # This will trigger your safety check in Processor
-            df_backup = pd.read_csv(backup_path, index_col=0, parse_dates=True)
-            return df_backup.tail(days)
     # 🛡️ STREAMLIT CACHE: Ignores '_self' so it doesn't try to hash the Finnhub client.
     # ttl=3600 caches the news for 1 hour so repeated button clicks load instantly.

     def fetch_market_data(self, days=50):
         """
+        Exclusively loads market data from backup to ensure 100% uptime for demo.
         """
+        print(f"📁 System: API bypassed. Loading localized market data...")
+        backup_path = "data/market_data_backup.csv"
+        if not os.path.exists(backup_path):
+            print(f"🚨 FATAL: {backup_path} not found!")
+            return pd.DataFrame()
+        df = pd.read_csv(backup_path, index_col=0, parse_dates=True)
+        return df.tail(days)
+    # def fetch_market_data(self, days=50):
+    #     """
+    #     Fetches market data using Finnhub (SPY as proxy) with a CSV fallback.
+    #     """
+    #     print(f"📡 Attempting to fetch last {days} days from Finnhub (using SPY proxy)...")
+    #     try:
+    #         # 1. Setup Timestamps (Finnhub needs Unix seconds)
+    #         end_ts = int(time.time())
+    #         start_ts = int((datetime.now() - timedelta(days=days+10)).timestamp())
+    #         # 2. Fetch SPY (S&P 500 Proxy)
+    #         # 'D' means daily candles
+    #         res = self.finnhub_client.stock_candles('SPY', 'D', start_ts, end_ts)
+    #         if res.get('s') != 'ok':
+    #             raise ValueError(f"Finnhub API returned status: {res.get('s')}")
+    #         # Convert Finnhub response to DataFrame
+    #         df = pd.DataFrame({
+    #             'Date': pd.to_datetime(res['t'], unit='s'),
+    #             'Close': res['c'],
+    #             'Open': res['o'],
+    #             'High': res['h'],
+    #             'Low': res['l'],
+    #             'Volume': res['v']
+    #         }).set_index('Date')
+    #         # 3. Handle VIX (Finnhub free tier often blocks ^VIX)
+    #         # We attempt it, but if it fails, we merge from our backup data
+    #         try:
+    #             vix_res = self.finnhub_client.stock_candles('VIX', 'D', start_ts, end_ts)
+    #             if vix_res.get('s') == 'ok':
+    #                 df['VIX'] = vix_res['c']
+    #             else:
+    #                 raise Exception("VIX not available on API")
+    #         except:
+    #             print("⚠️ VIX not available on Finnhub. Pulling VIX from backup...")
+    #             backup_df = pd.read_csv("data/market_data_backup.csv", index_col=0, parse_dates=True)
+    #             # Reindex backup to match the dates we just got from the API
+    #             df['VIX'] = backup_df['VIX'].reindex(df.index, method='ffill')
+    #         # Final cleanup
+    #         df = df.ffill().dropna()
+    #         if df.empty:
+    #             raise ValueError("Resulting DataFrame is empty.")
+    #         return df
+    #     except Exception as e:
+    #         print(f"⚠️ Finnhub fetch failed ({e}). Loading full backup from data/ folder...")
+    #         backup_path = "data/market_data_backup.csv"
+    #         if not os.path.exists(backup_path):
+    #             print(f"🚨 FATAL: {backup_path} not found!")
+    #             return pd.DataFrame() # This will trigger your safety check in Processor
+    #         df_backup = pd.read_csv(backup_path, index_col=0, parse_dates=True)
+    #         return df_backup.tail(days)
     # 🛡️ STREAMLIT CACHE: Ignores '_self' so it doesn't try to hash the Finnhub client.
     # ttl=3600 caches the news for 1 hour so repeated button clicks load instantly.

src/processor.py CHANGED Viewed

@@ -1,8 +1,11 @@
 import pandas as pd
 import numpy as np
 import torch
-from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
 import joblib
 class Processor:
     def __init__(self, scaler_path="models/robust_scaler.pkl"):
@@ -10,14 +13,14 @@ class Processor:
         self.device = 0 if torch.cuda.is_available() else -1
         self.model_name = "ProsusAI/finbert"
-        # Load Scaler (Required for normalization before GRU)
         try:
             self.scaler = joblib.load(scaler_path)
             print(f"✅ Scaler loaded from {scaler_path}")
         except:
-            print("⚠️ Scaler not found. Ensure robust_scaler.pkl is in models/ folder.")
-        # Initialize FinBERT Pipeline
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         self.model = AutoModelForSequenceClassification.from_pretrained(
             self.model_name, use_safetensors=True
@@ -29,35 +32,57 @@ class Processor:
             device=self.device
         )
     def process(self, df_market, df_news):
         """
-        Modified to return metadata and historical features for Streamlit display.
         """
-        # 1. Process Sentiment features from headlines
         df_sent, df_news_scored = self._generate_sentiment_profile(df_news)
-        # 2. Merge with market data and engineer all 14 features
         df_features = self._engineer_14_features(df_market, df_sent)
-        # 3. Extract metadata for the UI (latest day's values)
         latest_metrics = {
             "Sent_Mean": df_features['Sent_Mean'].iloc[-1],
-            "News_Volume": np.exp(df_features['News_Volume'].iloc[-1]) - 1, # Reverse log
             "Panic_Interaction": df_features['Sent_x_VIX'].iloc[-1],
             "RSI": df_features['RSI'].iloc[-1] * 100
         }
-        # 4. Get the last 30 days and scale for the Model
         final_window = df_features.tail(30).values
         scaled_window = self.scaler.transform(final_window)
         input_tensor = np.expand_dims(scaled_window, axis=0).astype('float32')
-        # FIX: We now return df_features so app.py can plot the historical 30-day sentiment
         return input_tensor, latest_metrics, df_features, df_news_scored
     def _generate_sentiment_profile(self, df_news):
         print("🧠 Running FinBERT Batch Analysis...")
         titles = df_news['Title'].astype(str).tolist()
         results = self.sentiment_pipe(titles, batch_size=32, truncation=True)
         scores = []
@@ -65,9 +90,7 @@ class Processor:
             label, score = res['label'].lower(), res['score']
             scores.append(score if label == 'positive' else -score if label == 'negative' else 0.0)
-        df_news['Score'] = scores # Add scores to the raw news df
-        # Ensure dates match format for grouping
         df_news['Date'] = pd.to_datetime(df_news['Date']).dt.date
         grouped = df_news.groupby('Date')['Score']
@@ -78,14 +101,15 @@ class Processor:
             'Net_Bull': grouped.apply(lambda x: x.sum() / (len(x) + 1))
         }).fillna(0.0)
-        # Convert index back to datetime for merging
         daily.index = pd.to_datetime(daily.index)
         return daily, df_news
     def _engineer_14_features(self, df, df_sent):
         data = df.copy()
         # --- QUANT BRANCH (7 Features) ---
         tp = (data['High'] + data['Low'] + data['Close']) / 3
         vwap = (tp * data['Volume']).rolling(20).sum() / (data['Volume'].rolling(20).sum() + 1e-9)
@@ -96,25 +120,27 @@ class Processor:
         loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
         data['RSI'] = (100 - (100 / (1 + (gain/(loss + 1e-9))))) / 100.0
-        ema_12, ema_26 = data['Close'].ewm(span=12).mean(), data['Close'].ewm(span=26).mean()
-        data['MACD_Hist'] = ((ema_12 - ema_26) - (ema_12 - ema_26).ewm(span=9).mean()) / data['Close']
         data['VIX_Norm'] = data['VIX'] / 100.0
         data['VIX_Change'] = data['VIX'].pct_change()
-        tr = pd.concat([data['High']-data['Low'], abs(data['High']-data['Close'].shift()),
-                       abs(data['Low']-data['Close'].shift())], axis=1).max(axis=1)
         data['ATR_Dist'] = np.tanh((data['Close'] - data['Close'].rolling(22).mean()) / (tr.rolling(14).mean() + 1e-9))
         data['Realized_Vol'] = data['Close'].pct_change().rolling(10).std() * 10
         # --- SENTIMENT BRANCH (7 Features) ---
-        # Ensure indices match for joining
         data.index = pd.to_datetime(data.index)
         data = data.join(df_sent, how='left').fillna(0.0)
         data['Sent_Mean_Delta'] = data['Sent_Mean'].diff().fillna(0.0)
         data['Sent_Mean_EMA'] = data['Sent_Mean'].ewm(span=3).mean()
-        data['Sent_x_VIX'] = data['Sent_Mean'] * data['VIX_Norm'] # Panic Interaction
         feature_cols = [
             'VWAP_Dist', 'RSI', 'MACD_Hist', 'VIX_Norm', 'VIX_Change', 'ATR_Dist', 'Realized_Vol',

+import os
 import pandas as pd
 import numpy as np
 import torch
 import joblib
+import time
+from datetime import datetime
+from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
 class Processor:
     def __init__(self, scaler_path="models/robust_scaler.pkl"):
         self.device = 0 if torch.cuda.is_available() else -1
         self.model_name = "ProsusAI/finbert"
+        # 1. Load Scaler
         try:
             self.scaler = joblib.load(scaler_path)
             print(f"✅ Scaler loaded from {scaler_path}")
         except:
+            print("⚠️ Scaler not found in models/ folder.")
+        # 2. Initialize FinBERT
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         self.model = AutoModelForSequenceClassification.from_pretrained(
             self.model_name, use_safetensors=True
             device=self.device
         )
+    def fetch_market_data(self, days=60):
+        """
+        Loads market data from your provided CSV backup.
+        Bypasses Finnhub to avoid 403 errors during presentation.
+        """
+        print(f"📁 System: Bypassing API. Loading local market data...")
+        backup_path = "data/market_data_backup.csv"
+        if not os.path.exists(backup_path):
+            print(f"🚨 FATAL: {backup_path} not found!")
+            return pd.DataFrame()
+        df = pd.read_csv(backup_path, index_col=0, parse_dates=True)
+        # Optional: Sync dates to today for presentation realism
+        # last_date = df.index[-1]
+        # offset = pd.Timestamp(datetime.now().date()) - last_date
+        # df.index = df.index + offset
+        return df.tail(days)
     def process(self, df_market, df_news):
         """
+        Main pipeline: News Sentiment -> Feature Engineering -> GRU Input
         """
+        # 1. Process Sentiment from headlines
         df_sent, df_news_scored = self._generate_sentiment_profile(df_news)
+        # 2. Merge and engineer all 14 features
         df_features = self._engineer_14_features(df_market, df_sent)
+        # 3. Extract metadata for Streamlit UI
         latest_metrics = {
             "Sent_Mean": df_features['Sent_Mean'].iloc[-1],
+            "News_Volume": int(np.exp(df_features['News_Volume'].iloc[-1]) - 1),
             "Panic_Interaction": df_features['Sent_x_VIX'].iloc[-1],
             "RSI": df_features['RSI'].iloc[-1] * 100
         }
+        # 4. Prepare 30-day window for GRU
         final_window = df_features.tail(30).values
         scaled_window = self.scaler.transform(final_window)
         input_tensor = np.expand_dims(scaled_window, axis=0).astype('float32')
         return input_tensor, latest_metrics, df_features, df_news_scored
     def _generate_sentiment_profile(self, df_news):
         print("🧠 Running FinBERT Batch Analysis...")
         titles = df_news['Title'].astype(str).tolist()
+        # Batch processing to handle 1700+ headlines efficiently
         results = self.sentiment_pipe(titles, batch_size=32, truncation=True)
         scores = []
             label, score = res['label'].lower(), res['score']
             scores.append(score if label == 'positive' else -score if label == 'negative' else 0.0)
+        df_news['Score'] = scores
         df_news['Date'] = pd.to_datetime(df_news['Date']).dt.date
         grouped = df_news.groupby('Date')['Score']
             'Net_Bull': grouped.apply(lambda x: x.sum() / (len(x) + 1))
         }).fillna(0.0)
         daily.index = pd.to_datetime(daily.index)
         return daily, df_news
     def _engineer_14_features(self, df, df_sent):
         data = df.copy()
+        data.columns = [c.capitalize() for c in data.columns]
+        if 'Vix' in data.columns: data = data.rename(columns={'Vix': 'VIX'})
         # --- QUANT BRANCH (7 Features) ---
         tp = (data['High'] + data['Low'] + data['Close']) / 3
         vwap = (tp * data['Volume']).rolling(20).sum() / (data['Volume'].rolling(20).sum() + 1e-9)
         loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
         data['RSI'] = (100 - (100 / (1 + (gain/(loss + 1e-9))))) / 100.0
+        ema_12 = data['Close'].ewm(span=12).mean()
+        ema_26 = data['Close'].ewm(span=26).mean()
+        macd = ema_12 - ema_26
+        data['MACD_Hist'] = (macd - macd.ewm(span=9).mean()) / data['Close']
         data['VIX_Norm'] = data['VIX'] / 100.0
         data['VIX_Change'] = data['VIX'].pct_change()
+        tr = pd.concat([data['High']-data['Low'],
+                        abs(data['High']-data['Close'].shift()),
+                        abs(data['Low']-data['Close'].shift())], axis=1).max(axis=1)
         data['ATR_Dist'] = np.tanh((data['Close'] - data['Close'].rolling(22).mean()) / (tr.rolling(14).mean() + 1e-9))
         data['Realized_Vol'] = data['Close'].pct_change().rolling(10).std() * 10
         # --- SENTIMENT BRANCH (7 Features) ---
         data.index = pd.to_datetime(data.index)
         data = data.join(df_sent, how='left').fillna(0.0)
         data['Sent_Mean_Delta'] = data['Sent_Mean'].diff().fillna(0.0)
         data['Sent_Mean_EMA'] = data['Sent_Mean'].ewm(span=3).mean()
+        data['Sent_x_VIX'] = data['Sent_Mean'] * data['VIX_Norm']
         feature_cols = [
             'VWAP_Dist', 'RSI', 'MACD_Hist', 'VIX_Norm', 'VIX_Change', 'ATR_Dist', 'Realized_Vol',