DevKX committed on
Commit
1f009b3
Β·
verified Β·
1 Parent(s): 2d5897e

Upload data_fetcher.py

Browse files
Files changed (1) hide show
  1. data_fetcher.py +261 -0
data_fetcher.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import yfinance as yf
4
+ import pandas as pd
5
+ import finnhub
6
+ import streamlit as st
7
+ import requests
8
+ from dotenv import load_dotenv
9
+ from datetime import datetime, timedelta
10
+
11
+ # Load environment variables
12
+ load_dotenv()
13
+
14
class DataFetcher:
    """Fetches S&P 500 prices + VIX from the FMP Stable API and market news
    from Finnhub.

    Every network path degrades gracefully: price/VIX fetches fall back to a
    local backup CSV (or a neutral constant for VIX) so the app keeps working
    when API keys are missing or the APIs are blocked.
    """

    # Local failsafe CSV written by a previous successful run.
    BACKUP_PATH = "data/market_data_backup.csv"

    # Neutral long-run VIX level used when both the API and the backup fail.
    DEFAULT_VIX = 18.0

    def __init__(self, ticker="^GSPC", vix_ticker="%5EVIX"):
        """`ticker`/`vix_ticker` are FMP symbols; the VIX default is the
        pre-URL-encoded form of ^VIX (FMP expects it encoded in the query)."""
        self.ticker = ticker
        self.vix_ticker = vix_ticker

        # API keys come from a local .env file or HF Spaces secrets.
        self.finnhub_key = os.getenv("FINNHUB_API_KEY")
        self.fmp_key = os.getenv("FMP_API_KEY")

        if not self.finnhub_key or not self.fmp_key:
            print("⚠️ Warning: API Keys missing! Check your .env file or HF Secrets.")

        # Finnhub client is only used for the news endpoints.
        self.finnhub_client = finnhub.Client(api_key=self.finnhub_key)

    # ------------------------------------------------------------------ #
    # Internal helpers                                                   #
    # ------------------------------------------------------------------ #

    @staticmethod
    def _to_naive_midnight(dates):
        """Convert a date Series to tz-naive timestamps set to midnight.

        FMP occasionally returns tz-aware date strings; stripping the
        timezone keeps the SPY and VIX indexes perfectly aligned on merge.
        """
        dates = pd.to_datetime(dates)
        if dates.dt.tz is not None:
            dates = dates.dt.tz_localize(None)
        return dates.dt.normalize()

    @staticmethod
    def _normalize_index(index):
        """Same tz-stripping as _to_naive_midnight, for a DatetimeIndex."""
        if index.tz is not None:
            index = index.tz_localize(None)
        return index.normalize()

    def _fmp_history(self, symbol, timeout):
        """Call the FMP Stable EOD endpoint for *symbol*, return parsed JSON."""
        url = (
            "https://financialmodelingprep.com/stable/historical-price-eod/full"
            f"?symbol={symbol}&apikey={self.fmp_key}"
        )
        return requests.get(url, timeout=timeout).json()

    # ------------------------------------------------------------------ #
    # Market data                                                        #
    # ------------------------------------------------------------------ #

    def fetch_market_data(self, days=60):
        """Fetches live SPY data from the NEW FMP Stable API and merges VIX.

        Returns a DataFrame indexed by tz-naive date with columns
        Open/High/Low/Close/Volume/VIX, trimmed to the last *days* rows.
        Falls back to the backup CSV on any error.
        """
        if not self.fmp_key:
            return self._load_backup(days)

        try:
            print(f"📡 Fetching live data for {self.ticker} from FMP Stable API...")

            spy_res = self._fmp_history(self.ticker, timeout=10)

            # FMP reports problems as a dict instead of the usual list payload.
            if isinstance(spy_res, dict) and "Error Message" in spy_res:
                print(f"🚨 FMP Error: {spy_res['Error Message']}")
                return self._load_backup(days)

            if not isinstance(spy_res, list) or len(spy_res) == 0:
                return self._load_backup(days)

            # Format main DataFrame with a clean, tz-naive midnight index.
            df = pd.DataFrame(spy_res)
            df['date'] = self._to_naive_midnight(df['date'])

            df.set_index('date', inplace=True)
            df = df.sort_index()[['open', 'high', 'low', 'close', 'volume']]
            df.columns = [c.capitalize() for c in df.columns]

            # VIX may cover fewer dates than SPY; fill both directions so
            # no NaNs leak into downstream processing.
            df['VIX'] = self._get_vix_data()
            df['VIX'] = df['VIX'].ffill().bfill()

            print("✅ Live market data fetched and merged successfully!")
            return df.tail(days)

        except Exception as e:
            print(f"🚨 Major Fetch Error: {e}")
            return self._load_backup(days)

    def _get_vix_data(self):
        """Attempts to fetch VIX from Stable API, falls back to CSV if blocked.

        Returns a Series aligned on tz-naive dates, or DEFAULT_VIX (a scalar
        that broadcasts across the caller's index) as a last resort.
        """
        print("📡 Attempting to fetch VIX from FMP Stable API...")
        try:
            vix_res = self._fmp_history(self.vix_ticker, timeout=5)

            if isinstance(vix_res, list) and len(vix_res) > 0:
                vix_df = pd.DataFrame(vix_res)

                # Strip timezones so the VIX index matches SPY exactly.
                vix_df['date'] = self._to_naive_midnight(vix_df['date'])

                vix_df.set_index('date', inplace=True)
                vix_df = vix_df.sort_index()
                print("✅ VIX fetched successfully from FMP!")
                return vix_df['close']
        except Exception as e:
            print(f"⚠️ VIX API request failed: {e}")

        print("⚠️ Pulling VIX from local backup...")

        if os.path.exists(self.BACKUP_PATH):
            backup_df = pd.read_csv(self.BACKUP_PATH, index_col=0, parse_dates=True)
            # Strip timezones from the backup CSV index as well!
            backup_df.index = self._normalize_index(backup_df.index)

            if 'VIX' in backup_df.columns:
                return backup_df['VIX']

        # Historically-neutral flat VIX level.
        return self.DEFAULT_VIX

    def _load_backup(self, days):
        """Failsafe method to load local CSV if API entirely blocks the request."""
        print("📁 System: Loading localized market data backup...")
        if not os.path.exists(self.BACKUP_PATH):
            print("🚨 Market backup CSV not found!")
            return pd.DataFrame()
        df = pd.read_csv(self.BACKUP_PATH, index_col=0, parse_dates=True)
        # Consistency fix: normalize the index the same way the live path
        # and the VIX backup path do, so merges never mix tz-aware/naive.
        df.index = self._normalize_index(df.index)
        return df.tail(days)

    # ------------------------------------------------------------------ #
    # News                                                               #
    # ------------------------------------------------------------------ #

    # 🛡️ STREAMLIT CACHE: the '_self' leading underscore tells Streamlit not
    # to hash the DataFetcher instance (the Finnhub client is unhashable).
    # ttl=3600 caches the news for 1 hour so repeated button clicks load instantly.
    @st.cache_data(ttl=3600, show_spinner=False)
    def fetch_market_news(_self, days=45):
        """
        Fetches historical market news by looping through days.
        Uses 'SPY' as a proxy to allow historical date filtering on Finnhub.
        Returns a DataFrame with 'Title' and 'Date' columns.
        """
        print(f"📰 Fetching last {days} days of market headlines...")

        all_news = []
        end_date = datetime.now()

        # Try to render a Streamlit progress bar if running inside app.py;
        # outside a Streamlit session st.progress raises, so fall back to None.
        try:
            progress_bar = st.progress(0, text="Fetching historical news data (avoiding rate limits)...")
        except Exception:
            progress_bar = None

        # Loop backwards through time, day by day
        for i in range(days):
            target_date = end_date - timedelta(days=i)
            date_str = target_date.strftime('%Y-%m-%d')

            try:
                # FINNHUB TRICK: Use 'SPY' company news to get historical market coverage
                daily_news = _self.finnhub_client.company_news('SPY', _from=date_str, to=date_str)

                if daily_news:
                    all_news.extend(daily_news)

                # 🛑 RATE LIMIT SHIELD: Finnhub free tier allows 60 requests/minute.
                # Sleeping for 1.1 seconds guarantees we stay perfectly under the limit.
                time.sleep(1.1)

            except Exception as e:
                print(f"⚠️ API Error on {date_str}: {e}")
                time.sleep(5)  # Take a longer pause if the API gets angry

            # Update UI progress
            if progress_bar:
                progress_bar.progress((i + 1) / days, text=f"Fetched news for {date_str}...")

        # Clear the progress bar when finished
        if progress_bar:
            progress_bar.empty()

        # Convert the master list into a DataFrame
        df_news = pd.DataFrame(all_news)

        if df_news.empty:
            print("⚠️ No news found in the specified window.")
            return pd.DataFrame(columns=['Title', 'Date'])

        # Convert Unix timestamp to YYYY-MM-DD Date object
        df_news['Date'] = pd.to_datetime(df_news['datetime'], unit='s').dt.date

        # Rename columns to match what Processor expects
        df_news = df_news[['headline', 'Date']].rename(columns={'headline': 'Title'})

        # Drop duplicates in case of overlapping API returns
        df_news = df_news.drop_duplicates(subset=['Title', 'Date'])

        print(f"✅ Successfully fetched {len(df_news)} historical headlines.")
        return df_news
248
if __name__ == "__main__":
    # Local smoke test: exercise both pipelines end-to-end and print samples.
    def _run_smoke_test():
        fetcher = DataFetcher()

        # Price pipeline
        market_df = fetcher.fetch_market_data(days=50)
        print("\n--- Market Data Sample ---")
        print(market_df.tail())

        # News pipeline
        news_df = fetcher.fetch_market_news(days=45)
        print("\n--- Market News Sample ---")
        print(news_df.head())
        print(news_df.tail())
        print(f"\nTotal Headlines Fetched: {len(news_df)}")

    _run_smoke_test()