DevKX committed on
Commit
6ccd6c8
·
verified ·
1 Parent(s): 6a10e86

Delete data_fetcher.py

Browse files
Files changed (1) hide show
  1. data_fetcher.py +0 -142
data_fetcher.py DELETED
@@ -1,142 +0,0 @@
1
- import os
2
- import time
3
- import yfinance as yf
4
- import pandas as pd
5
- import finnhub
6
- import streamlit as st
7
- from dotenv import load_dotenv
8
- from datetime import datetime, timedelta
9
-
10
- # Load environment variables
11
- load_dotenv()
12
-
13
class DataFetcher:
    """Fetches market OHLCV/VIX data (Yahoo Finance) and headlines (Finnhub).

    Falls back to a local CSV backup when Yahoo Finance blocks the server IP.
    Requires FINNHUB_API_KEY in the environment (loaded from .env at import).
    """

    def __init__(self, ticker="^GSPC", vix_ticker="^VIX"):
        """
        Args:
            ticker: Yahoo Finance symbol for the main index (default: S&P 500).
            vix_ticker: Yahoo Finance symbol for the volatility index.

        Raises:
            ValueError: if FINNHUB_API_KEY is missing from the environment.
        """
        self.ticker = ticker
        self.vix_ticker = vix_ticker

        # Initialize Finnhub Client
        api_key = os.getenv("FINNHUB_API_KEY")
        if not api_key:
            raise ValueError("❌ FINNHUB_API_KEY not found in .env file!")

        self.finnhub_client = finnhub.Client(api_key=api_key)

    def fetch_market_data(self, days=50):
        """
        Fetches raw OHLCV and VIX data from Yahoo Finance.
        Falls back to local CSV in the data/ folder if Yahoo blocks the server IP.

        Args:
            days: number of most-recent daily bars to return.

        Returns:
            pd.DataFrame indexed by date with OHLCV columns plus a 'VIX' column
            (live path), or the tail of the backup CSV (fallback path).
        """
        print(f"📡 Attempting to fetch last {days} days of {self.ticker} and {self.vix_ticker}...")

        try:
            # 1. TRY TO FETCH LIVE DATA
            # BUG FIX: these two downloads were commented out, which left `df`
            # and `df_vix` undefined — the try block always raised NameError
            # and the method could only ever reach the CSV fallback.
            df = yf.download(self.ticker, period=f"{days}d", interval="1d", progress=False)
            df_vix = yf.download(self.vix_ticker, period=f"{days}d", interval="1d", progress=False)

            # If the dataframe is empty (Yahoo stealth-blocked us), force an
            # error immediately — before we try to use its columns below.
            if df.empty:
                raise ValueError("Yahoo Finance returned empty data.")

            # yfinance sometimes returns MultiIndex columns; flatten them.
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = df.columns.get_level_values(0)
            if isinstance(df_vix.columns, pd.MultiIndex):
                df_vix.columns = df_vix.columns.get_level_values(0)

            # Align VIX closes onto the index frame; forward-fill any gaps
            # left by mismatched trading calendars.
            df['VIX'] = df_vix['Close']
            df = df.ffill()

            return df

        except Exception as e:
            # 2. FALLBACK TO LOCAL CSV IF BLOCKED
            print(f"⚠️ Live fetch failed ({e}). Loading backup data from data/ folder...")

            # Load the CSV from the local data folder.
            backup_path = "data/market_data_backup.csv"
            df_backup = pd.read_csv(backup_path, index_col=0, parse_dates=True)

            # Return only the requested number of days.
            return df_backup.tail(days)

    # 🛡️ STREAMLIT CACHE: the leading underscore in '_self' tells Streamlit
    # not to try to hash the Finnhub client. ttl=3600 caches the news for
    # 1 hour so repeated button clicks load instantly.
    @st.cache_data(ttl=3600, show_spinner=False)
    def fetch_market_news(_self, days=45):
        """
        Fetches historical market news by looping through days.
        Uses 'SPY' as a proxy to allow historical date filtering on Finnhub.

        Args:
            days: number of calendar days of headlines to fetch, newest first.

        Returns:
            pd.DataFrame with columns ['Title', 'Date'], deduplicated; empty
            frame with those columns if no news was found.
        """
        print(f"📰 Fetching last {days} days of market headlines...")

        all_news = []
        end_date = datetime.now()

        # Try to render a Streamlit progress bar if running inside app.py;
        # outside a Streamlit session st.progress raises, so fall back to None.
        try:
            progress_bar = st.progress(0, text="Fetching historical news data (avoiding rate limits)...")
        except Exception:
            # BUG FIX: was a bare `except:`, which would also swallow
            # SystemExit and KeyboardInterrupt.
            progress_bar = None

        # Loop backwards through time, day by day.
        for i in range(days):
            target_date = end_date - timedelta(days=i)
            date_str = target_date.strftime('%Y-%m-%d')

            try:
                # FINNHUB TRICK: Use 'SPY' company news to get historical
                # market coverage with date filtering.
                daily_news = _self.finnhub_client.company_news('SPY', _from=date_str, to=date_str)

                if daily_news:
                    all_news.extend(daily_news)

                # 🛑 RATE LIMIT SHIELD: Finnhub free tier allows 60 requests/minute.
                # Sleeping for 1.1 seconds guarantees we stay under the limit.
                time.sleep(1.1)

            except Exception as e:
                print(f"⚠️ API Error on {date_str}: {e}")
                time.sleep(5)  # Take a longer pause if the API gets angry

            # Update UI progress
            if progress_bar:
                progress_bar.progress((i + 1) / days, text=f"Fetched news for {date_str}...")

        # Clear the progress bar when finished
        if progress_bar:
            progress_bar.empty()

        # Convert the master list into a DataFrame
        df_news = pd.DataFrame(all_news)

        if df_news.empty:
            print("⚠️ No news found in the specified window.")
            return pd.DataFrame(columns=['Title', 'Date'])

        # Convert Unix timestamp to YYYY-MM-DD date object
        df_news['Date'] = pd.to_datetime(df_news['datetime'], unit='s').dt.date

        # Rename columns to match what Processor expects
        df_news = df_news[['headline', 'Date']].rename(columns={'headline': 'Title'})

        # Drop duplicates in case of overlapping API returns
        df_news = df_news.drop_duplicates(subset=['Title', 'Date'])

        print(f"✅ Successfully fetched {len(df_news)} historical headlines.")
        return df_news
128
-
129
if __name__ == "__main__":
    # Quick command-line smoke test of both fetchers.
    data_fetcher = DataFetcher()

    # Exercise the market-data path and show the most recent rows.
    prices = data_fetcher.fetch_market_data(days=50)
    print("\n--- Market Data Sample ---")
    print(prices.tail())

    # Exercise the news path and show a slice from each end.
    headlines = data_fetcher.fetch_market_news(days=45)
    print("\n--- Market News Sample ---")
    print(headlines.head())
    print(headlines.tail())
    print(f"\nTotal Headlines Fetched: {len(headlines)}")