import os
import time
import yfinance as yf
import pandas as pd
import finnhub
import streamlit as st
import requests
from dotenv import load_dotenv
from datetime import datetime, timedelta
# Load environment variables
load_dotenv()
class DataFetcher:
    """Fetches market price data (FMP stable API) and news headlines (Finnhub),
    with local CSV fallbacks when either API is unavailable or blocked."""

    def __init__(self, ticker="^GSPC", vix_ticker="%5EVIX"):
        # NOTE: vix_ticker is pre-URL-encoded ("%5E" == "^") because it is
        # interpolated directly into FMP query strings below.
        self.ticker = ticker
        self.vix_ticker = vix_ticker
        # Load API keys from the environment (.env locally, HF Secrets when deployed)
        self.finnhub_key = os.getenv("FINNHUB_API_KEY")
        self.fmp_key = os.getenv("FMP_API_KEY")
        if not self.finnhub_key or not self.fmp_key:
            print("Warning: API Keys missing! Check your .env file or HF Secrets.")
        # Finnhub client is used only for news fetching
        self.finnhub_client = finnhub.Client(api_key=self.finnhub_key)

    def fetch_market_data(self, days=60):
        """Fetch live OHLCV data for ``self.ticker`` from the FMP Stable API,
        merge the VIX close onto it, and return the last ``days`` rows.

        Falls back to the local backup CSV on any API error or malformed
        response. Returned frame is indexed by tz-naive midnight dates with
        columns Open/High/Low/Close/Volume/VIX.
        """
        if not self.fmp_key:
            return self._load_backup(days)
        try:
            print(f"Fetching live data for {self.ticker} from FMP Stable API...")
            spy_url = f"https://financialmodelingprep.com/stable/historical-price-eod/full?symbol={self.ticker}&apikey={self.fmp_key}"
            spy_res = requests.get(spy_url, timeout=10).json()
            # FMP signals errors as a dict payload instead of a list of rows
            if isinstance(spy_res, dict) and "Error Message" in spy_res:
                print(f"FMP Error: {spy_res['Error Message']}")
                return self._load_backup(days)
            if not isinstance(spy_res, list) or len(spy_res) == 0:
                return self._load_backup(days)

            df = pd.DataFrame(spy_res)
            # Convert to datetime, strip timezones, and snap to midnight so the
            # SPY index aligns exactly with the VIX index for the join below.
            df['date'] = pd.to_datetime(df['date'])
            if df['date'].dt.tz is not None:
                df['date'] = df['date'].dt.tz_localize(None)
            df['date'] = df['date'].dt.normalize()
            df.set_index('date', inplace=True)
            df = df.sort_index()[['open', 'high', 'low', 'close', 'volume']]
            df.columns = [c.capitalize() for c in df.columns]

            # Merge VIX; ffill/bfill covers dates missing from the VIX series
            df['VIX'] = self._get_vix_data()
            df['VIX'] = df['VIX'].ffill().bfill()
            print("Live market data fetched and merged successfully!")
            return df.tail(days)
        except Exception as e:
            print(f"Major Fetch Error: {e}")
            return self._load_backup(days)

    def _get_vix_data(self):
        """Fetch the VIX close series from the FMP Stable API.

        Falls back to the ``VIX`` column of the local backup CSV, and finally
        to the scalar 18.0 (a neutral long-run VIX level) if no data exists.
        """
        print("Attempting to fetch VIX from FMP Stable API...")
        try:
            vix_url = f"https://financialmodelingprep.com/stable/historical-price-eod/full?symbol={self.vix_ticker}&apikey={self.fmp_key}"
            vix_res = requests.get(vix_url, timeout=5).json()
            if isinstance(vix_res, list) and len(vix_res) > 0:
                vix_df = pd.DataFrame(vix_res)
                # Strip timezones and normalize so the index matches SPY exactly
                vix_df['date'] = pd.to_datetime(vix_df['date'])
                if vix_df['date'].dt.tz is not None:
                    vix_df['date'] = vix_df['date'].dt.tz_localize(None)
                vix_df['date'] = vix_df['date'].dt.normalize()
                vix_df.set_index('date', inplace=True)
                vix_df = vix_df.sort_index()
                print("VIX fetched successfully from FMP!")
                return vix_df['close']
        except Exception as e:
            print(f"VIX API request failed: {e}")

        print("Pulling VIX from local backup...")
        backup_path = "data/market_data_backup.csv"
        if os.path.exists(backup_path):
            backup_df = pd.read_csv(backup_path, index_col=0, parse_dates=True)
            # Strip timezones from the backup CSV index as well
            if backup_df.index.tz is not None:
                backup_df.index = backup_df.index.tz_localize(None)
            backup_df.index = backup_df.index.normalize()
            if 'VIX' in backup_df.columns:
                return backup_df['VIX']
        # Scalar fallback broadcasts to every row when assigned to df['VIX']
        return 18.0

    def _load_backup(self, days):
        """Failsafe: load the last ``days`` rows of the local backup CSV.

        Returns an empty DataFrame when the backup file is missing so callers
        can detect total failure.
        """
        print("System: Loading localized market data backup...")
        backup_path = "data/market_data_backup.csv"
        if not os.path.exists(backup_path):
            print("Market backup CSV not found!")
            return pd.DataFrame()
        df = pd.read_csv(backup_path, index_col=0, parse_dates=True)
        return df.tail(days)

    # STREAMLIT CACHE: leading underscore on '_self' tells Streamlit not to
    # hash the DataFetcher instance (the Finnhub client is unhashable).
    # ttl=3600 caches news for 1 hour so repeated button clicks load instantly.
    @st.cache_data(ttl=3600, show_spinner=False)
    def fetch_market_news(_self, days=45):
        """Fetch the last ``days`` days of market headlines, one day per request.

        Uses 'SPY' company news as a proxy because Finnhub allows historical
        date filtering on company news but not on general market news.
        Returns a DataFrame with columns ['Title', 'Date'] (empty if no news).
        """
        print(f"Fetching last {days} days of market headlines...")
        all_news = []
        end_date = datetime.now()

        # Render a Streamlit progress bar only when running inside the app;
        # st.progress raises outside a Streamlit session context.
        try:
            progress_bar = st.progress(0, text="Fetching historical news data (avoiding rate limits)...")
        except Exception:
            progress_bar = None

        # Loop backwards through time, day by day
        for i in range(days):
            target_date = end_date - timedelta(days=i)
            date_str = target_date.strftime('%Y-%m-%d')
            try:
                daily_news = _self.finnhub_client.company_news('SPY', _from=date_str, to=date_str)
                if daily_news:
                    all_news.extend(daily_news)
                # RATE LIMIT SHIELD: Finnhub free tier allows 60 requests/minute;
                # sleeping 1.1s per request keeps us safely under the limit.
                time.sleep(1.1)
            except Exception as e:
                print(f"API Error on {date_str}: {e}")
                time.sleep(5)  # Take a longer pause if the API gets angry
            if progress_bar:
                progress_bar.progress((i + 1) / days, text=f"Fetched news for {date_str}...")

        if progress_bar:
            progress_bar.empty()

        df_news = pd.DataFrame(all_news)
        if df_news.empty:
            print("No news found in the specified window.")
            return pd.DataFrame(columns=['Title', 'Date'])

        # Convert Unix timestamp to a plain date, then rename to the schema
        # the downstream Processor expects.
        df_news['Date'] = pd.to_datetime(df_news['datetime'], unit='s').dt.date
        df_news = df_news[['headline', 'Date']].rename(columns={'headline': 'Title'})
        # Drop duplicates in case of overlapping API returns
        df_news = df_news.drop_duplicates(subset=['Title', 'Date'])
        print(f"Successfully fetched {len(df_news)} historical headlines.")
        return df_news
if __name__ == "__main__":
fetcher = DataFetcher()
# Test Market Fetch
market_df = fetcher.fetch_market_data(days=50)
print("\n--- Market Data Sample ---")
print(market_df.tail())
# Test News Fetch
news_df = fetcher.fetch_market_news(days=45)
print("\n--- Market News Sample ---")
print(news_df.head())
print(news_df.tail())
print(f"\nTotal Headlines Fetched: {len(news_df)}") |