Spaces:
Sleeping
Sleeping
Khiry McCurn committed on
Update update_data.py
Browse files - update_data.py +52 -59
update_data.py
CHANGED
|
@@ -5,6 +5,7 @@ This script is designed to run as a scheduled job at end of trading day
|
|
| 5 |
|
| 6 |
Features:
|
| 7 |
- Batched downloads to avoid rate limiting
|
|
|
|
| 8 |
- Delays between batches
|
| 9 |
- Retry logic for failed downloads
|
| 10 |
- Supports ~600 tickers (S&P 500 + Nasdaq 100 + recent IPOs + ETFs)
|
|
@@ -27,29 +28,29 @@ TICKERS = [
|
|
| 27 |
# S&P 500 Components
|
| 28 |
'A', 'AAL', 'AAPL', 'ABBV', 'ABNB', 'ABT', 'ACGL', 'ACN', 'ADBE', 'ADI', 'ADM', 'ADP', 'ADSK', 'AEE', 'AEP',
|
| 29 |
'AES', 'AFL', 'AIG', 'AIZ', 'AJG', 'AKAM', 'ALB', 'ALGN', 'ALL', 'ALLE', 'AMAT', 'AMCR', 'AMD', 'AME', 'AMGN',
|
| 30 |
-
'AMP', 'AMT', 'AMZN', 'ANET', '
|
| 31 |
'AVY', 'AWK', 'AXON', 'AXP', 'AZO', 'BA', 'BAC', 'BALL', 'BAX', 'BBWI', 'BBY', 'BDX', 'BEN', 'BF-B', 'BG',
|
| 32 |
'BIIB', 'BIO', 'BK', 'BKNG', 'BKR', 'BLDR', 'BLK', 'BMY', 'BR', 'BRK-B', 'BRO', 'BSX', 'BWA', 'BX', 'BXP',
|
| 33 |
'C', 'CAG', 'CAH', 'CARR', 'CAT', 'CB', 'CBOE', 'CBRE', 'CCI', 'CCL', 'CDNS', 'CDW', 'CE', 'CEG', 'CF',
|
| 34 |
'CFG', 'CHD', 'CHRW', 'CHTR', 'CI', 'CINF', 'CL', 'CLX', 'CMCSA', 'CME', 'CMG', 'CMI', 'CMS', 'CNC', 'CNP',
|
| 35 |
'COF', 'COO', 'COP', 'COR', 'COST', 'CPAY', 'CPB', 'CPRT', 'CPT', 'CRL', 'CRM', 'CRWD', 'CSCO', 'CSGP', 'CSX',
|
| 36 |
-
'CTAS', '
|
| 37 |
'DG', 'DGX', 'DHI', 'DHR', 'DIS', 'DLR', 'DLTR', 'DOC', 'DOV', 'DOW', 'DPZ', 'DRI', 'DTE', 'DUK', 'DVA',
|
| 38 |
'DVN', 'DXCM', 'EA', 'EBAY', 'ECL', 'ED', 'EFX', 'EG', 'EIX', 'EL', 'ELV', 'EMN', 'EMR', 'ENPH', 'EOG',
|
| 39 |
'EPAM', 'EQIX', 'EQR', 'EQT', 'ERIE', 'ES', 'ESS', 'ETN', 'ETR', 'EVRG', 'EW', 'EXC', 'EXPD', 'EXPE', 'EXR',
|
| 40 |
'F', 'FANG', 'FAST', 'FCX', 'FDS', 'FDX', 'FE', 'FFIV', 'FI', 'FICO', 'FIS', 'FITB', 'FMC', 'FOX', 'FOXA',
|
| 41 |
'FRT', 'FSLR', 'FTNT', 'FTV', 'GD', 'GDDY', 'GE', 'GEHC', 'GEN', 'GEV', 'GILD', 'GIS', 'GL', 'GLW', 'GM',
|
| 42 |
-
'GNRC', 'GOOG', 'GOOGL', 'GPC', 'GPN', 'GRMN', 'GS', 'GWW', 'HAL', 'HAS', 'HBAN', 'HCA', 'HD', '
|
| 43 |
'HII', 'HLT', 'HOLX', 'HON', 'HPE', 'HPQ', 'HRL', 'HSIC', 'HST', 'HSY', 'HUBB', 'HUM', 'HWM', 'IBM', 'ICE',
|
| 44 |
'IDXX', 'IEX', 'IFF', 'INCY', 'INTC', 'INTU', 'INVH', 'IP', 'IPG', 'IQV', 'IR', 'IRM', 'ISRG', 'IT', 'ITW',
|
| 45 |
-
'IVZ', 'J', 'JBHT', 'JBL', 'JCI', 'JKHY', 'JNJ', '
|
| 46 |
'KKR', 'KLAC', 'KMB', 'KMI', 'KMX', 'KO', 'KR', 'KVUE', 'L', 'LDOS', 'LEN', 'LH', 'LHX', 'LIN', 'LKQ',
|
| 47 |
'LLY', 'LMT', 'LNT', 'LOW', 'LRCX', 'LULU', 'LUV', 'LVS', 'LW', 'LYB', 'LYV', 'MA', 'MAA', 'MAR', 'MAS',
|
| 48 |
'MCD', 'MCHP', 'MCK', 'MCO', 'MDLZ', 'MDT', 'MET', 'META', 'MGM', 'MHK', 'MKC', 'MKTX', 'MLM', 'MMC', 'MMM',
|
| 49 |
-
'MNST', 'MO', 'MOH', 'MOS', 'MPC', 'MPWR', 'MRK', 'MRNA', '
|
| 50 |
'MTD', 'MU', 'NCLH', 'NDAQ', 'NDSN', 'NEE', 'NEM', 'NFLX', 'NI', 'NKE', 'NOC', 'NOW', 'NRG', 'NSC', 'NTAP',
|
| 51 |
'NTRS', 'NUE', 'NVDA', 'NVR', 'NWS', 'NWSA', 'NXPI', 'O', 'ODFL', 'OKE', 'OMC', 'ON', 'ORCL', 'ORLY', 'OTIS',
|
| 52 |
-
'OXY', 'PANW', '
|
| 53 |
'PKG', 'PLD', 'PLTR', 'PM', 'PNC', 'PNR', 'PNW', 'PODD', 'POOL', 'PPG', 'PPL', 'PRU', 'PSA', 'PSX', 'PTC',
|
| 54 |
'PWR', 'PYPL', 'QCOM', 'QRVO', 'RCL', 'REG', 'REGN', 'RF', 'RJF', 'RL', 'RMD', 'ROK', 'ROL', 'ROP', 'ROST',
|
| 55 |
'RSG', 'RTX', 'RVTY', 'SBAC', 'SBUX', 'SCHW', 'SHW', 'SJM', 'SLB', 'SMCI', 'SNA', 'SNPS', 'SO', 'SOLV', 'SPG',
|
|
@@ -57,11 +58,11 @@ TICKERS = [
|
|
| 57 |
'TDY', 'TECH', 'TEL', 'TER', 'TFC', 'TFX', 'TGT', 'TJX', 'TMO', 'TMUS', 'TPR', 'TRGP', 'TRMB', 'TROW', 'TRV',
|
| 58 |
'TSCO', 'TSLA', 'TSN', 'TT', 'TTWO', 'TXN', 'TXT', 'TYL', 'UAL', 'UBER', 'UDR', 'UHS', 'ULTA', 'UNH', 'UNP',
|
| 59 |
'UPS', 'URI', 'USB', 'V', 'VICI', 'VLO', 'VLTO', 'VMC', 'VRSK', 'VRSN', 'VRTX', 'VST', 'VTR', 'VTRS', 'VZ',
|
| 60 |
-
'WAB', 'WAT', '
|
| 61 |
'WYNN', 'XEL', 'XOM', 'XYL', 'YUM', 'ZBH', 'ZBRA', 'ZTS',
|
| 62 |
|
| 63 |
# Nasdaq 100 additions (not in S&P 500)
|
| 64 |
-
'AZN', 'APP', 'ARM', 'CCEP', 'DASH', 'DDOG', 'GFS', 'MELI', '
|
| 65 |
|
| 66 |
# ETFs and Benchmarks
|
| 67 |
'SPY', 'QQQ', 'VTI', 'VOO', 'VXUS', 'DIA', 'IWM', 'TQQQ', 'SQQQ', 'BND', 'TLT', 'IEF', 'GLD', 'DBC', 'VNQ',
|
|
@@ -70,7 +71,7 @@ TICKERS = [
|
|
| 70 |
|
| 71 |
# Notable 2020-2021 IPOs
|
| 72 |
'RIVN', 'LCID', 'RBLX', 'COIN', 'HOOD', 'SNOW', 'U', 'CPNG', 'COUR', 'OSCR', 'SOFI', 'UPST', 'AFRM', 'PATH',
|
| 73 |
-
'
|
| 74 |
'DKNG', 'NKLA', 'BLNK', 'QS', 'GOEV', 'LAZR', 'LMND', 'OPEN',
|
| 75 |
|
| 76 |
# 2022 IPOs (notable)
|
|
@@ -98,20 +99,12 @@ DATA_DIR = "data"
|
|
| 98 |
BATCH_SIZE = 50 # Download 50 tickers at a time
|
| 99 |
BATCH_DELAY = 5 # Wait 5 seconds between batches
|
| 100 |
MAX_RETRIES = 2 # Retry failed downloads up to 2 times
|
|
|
|
| 101 |
|
| 102 |
|
| 103 |
def download_ticker_data(ticker, start_date, end_date, retry_count=0):
|
| 104 |
"""
|
| 105 |
Download historical data for a single ticker from Yahoo Finance
|
| 106 |
-
|
| 107 |
-
Args:
|
| 108 |
-
ticker: Stock ticker symbol
|
| 109 |
-
start_date: Start date for historical data
|
| 110 |
-
end_date: End date for historical data
|
| 111 |
-
retry_count: Current retry attempt number
|
| 112 |
-
|
| 113 |
-
Returns:
|
| 114 |
-
DataFrame with OHLCV data or None if download fails
|
| 115 |
"""
|
| 116 |
try:
|
| 117 |
data = yf.download(
|
|
@@ -130,7 +123,7 @@ def download_ticker_data(ticker, start_date, end_date, retry_count=0):
|
|
| 130 |
|
| 131 |
except Exception as e:
|
| 132 |
if retry_count < MAX_RETRIES:
|
| 133 |
-
time.sleep(2)
|
| 134 |
return download_ticker_data(ticker, start_date, end_date, retry_count + 1)
|
| 135 |
print(f"❌ Error downloading {ticker} after {MAX_RETRIES} retries: {e}")
|
| 136 |
return None
|
|
@@ -139,16 +132,6 @@ def download_ticker_data(ticker, start_date, end_date, retry_count=0):
|
|
| 139 |
def download_batch(tickers_batch, start_date, end_date, batch_num, total_batches):
|
| 140 |
"""
|
| 141 |
Download a batch of tickers
|
| 142 |
-
|
| 143 |
-
Args:
|
| 144 |
-
tickers_batch: List of tickers to download
|
| 145 |
-
start_date: Start date
|
| 146 |
-
end_date: End date
|
| 147 |
-
batch_num: Current batch number
|
| 148 |
-
total_batches: Total number of batches
|
| 149 |
-
|
| 150 |
-
Returns:
|
| 151 |
-
Dict of {ticker: DataFrame} for successful downloads
|
| 152 |
"""
|
| 153 |
print(f"\n📦 Batch {batch_num}/{total_batches} ({len(tickers_batch)} tickers)")
|
| 154 |
print(f" Tickers: {', '.join(tickers_batch[:10])}{'...' if len(tickers_batch) > 10 else ''}")
|
|
@@ -169,11 +152,7 @@ def download_batch(tickers_batch, start_date, end_date, batch_num, total_batches
|
|
| 169 |
def update_all_tickers():
|
| 170 |
"""
|
| 171 |
Update CSV files for all tickers with batching and rate limiting
|
| 172 |
-
|
| 173 |
-
Returns:
|
| 174 |
-
Tuple of (success_count, error_count)
|
| 175 |
"""
|
| 176 |
-
# Calculate date range (5 years of history)
|
| 177 |
end_date = datetime.now()
|
| 178 |
start_date = end_date - timedelta(days=5*365)
|
| 179 |
|
|
@@ -186,21 +165,17 @@ def update_all_tickers():
|
|
| 186 |
print(f"⏱️ Delay between batches: {BATCH_DELAY}s")
|
| 187 |
print(f"{'='*70}")
|
| 188 |
|
| 189 |
-
# Create data directory
|
| 190 |
os.makedirs(DATA_DIR, exist_ok=True)
|
| 191 |
|
| 192 |
-
# Split tickers into batches
|
| 193 |
batches = [TICKERS[i:i + BATCH_SIZE] for i in range(0, len(TICKERS), BATCH_SIZE)]
|
| 194 |
total_batches = len(batches)
|
| 195 |
|
| 196 |
success_count = 0
|
| 197 |
error_count = 0
|
| 198 |
|
| 199 |
-
# Process each batch
|
| 200 |
for batch_num, batch in enumerate(batches, 1):
|
| 201 |
results = download_batch(batch, start_date, end_date, batch_num, total_batches)
|
| 202 |
|
| 203 |
-
# Save successful downloads
|
| 204 |
for ticker, data in results.items():
|
| 205 |
filepath = os.path.join(DATA_DIR, f"{ticker}.csv")
|
| 206 |
data.to_csv(filepath)
|
|
@@ -208,7 +183,6 @@ def update_all_tickers():
|
|
| 208 |
|
| 209 |
error_count += len(batch) - len(results)
|
| 210 |
|
| 211 |
-
# Delay between batches (except for last batch)
|
| 212 |
if batch_num < total_batches:
|
| 213 |
print(f" ⏳ Waiting {BATCH_DELAY}s before next batch...")
|
| 214 |
time.sleep(BATCH_DELAY)
|
|
@@ -234,31 +208,54 @@ def update_all_tickers():
|
|
| 234 |
|
| 235 |
def push_to_huggingface(token):
|
| 236 |
"""
|
| 237 |
-
Push updated CSV files to Hugging Face repository
|
| 238 |
-
|
| 239 |
-
Args:
|
| 240 |
-
token: Hugging Face API token with write access
|
| 241 |
"""
|
| 242 |
try:
|
| 243 |
print(f"\n{'='*70}")
|
| 244 |
-
print("🚀 PUSHING TO HUGGING FACE")
|
| 245 |
print(f"{'='*70}\n")
|
| 246 |
|
| 247 |
-
# Login to Hugging Face
|
| 248 |
login(token=token)
|
| 249 |
api = HfApi()
|
| 250 |
|
| 251 |
-
#
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
|
| 261 |
-
print(f"✅ Successfully pushed to Hugging Face!")
|
| 262 |
print(f"{'='*70}\n")
|
| 263 |
return True
|
| 264 |
|
|
@@ -269,22 +266,18 @@ def push_to_huggingface(token):
|
|
| 269 |
|
| 270 |
def main():
|
| 271 |
"""Main execution function"""
|
| 272 |
-
# Get HF token from environment variable
|
| 273 |
hf_token = os.environ.get("HF_TOKEN")
|
| 274 |
|
| 275 |
if not hf_token:
|
| 276 |
print("❌ ERROR: HF_TOKEN environment variable not set!")
|
| 277 |
-
print("Please set your Hugging Face write token as HF_TOKEN")
|
| 278 |
sys.exit(1)
|
| 279 |
|
| 280 |
print(f"\n🎯 Horizon Backtester - Bulk Data Update")
|
| 281 |
print(f" Universe: {len(TICKERS)} tickers")
|
| 282 |
-
print(f" Estimated time: ~{(len(TICKERS) // BATCH_SIZE) * BATCH_DELAY // 60 +
|
| 283 |
|
| 284 |
-
# Update all ticker data
|
| 285 |
success_count, error_count = update_all_tickers()
|
| 286 |
|
| 287 |
-
# Push to Hugging Face if we had any successful downloads
|
| 288 |
if success_count > 0:
|
| 289 |
push_success = push_to_huggingface(hf_token)
|
| 290 |
if push_success:
|
|
|
|
| 5 |
|
| 6 |
Features:
|
| 7 |
- Batched downloads to avoid rate limiting
|
| 8 |
+
- Batched uploads to avoid timeout
|
| 9 |
- Delays between batches
|
| 10 |
- Retry logic for failed downloads
|
| 11 |
- Supports ~600 tickers (S&P 500 + Nasdaq 100 + recent IPOs + ETFs)
|
|
|
|
| 28 |
# S&P 500 Components
|
| 29 |
'A', 'AAL', 'AAPL', 'ABBV', 'ABNB', 'ABT', 'ACGL', 'ACN', 'ADBE', 'ADI', 'ADM', 'ADP', 'ADSK', 'AEE', 'AEP',
|
| 30 |
'AES', 'AFL', 'AIG', 'AIZ', 'AJG', 'AKAM', 'ALB', 'ALGN', 'ALL', 'ALLE', 'AMAT', 'AMCR', 'AMD', 'AME', 'AMGN',
|
| 31 |
+
'AMP', 'AMT', 'AMZN', 'ANET', 'AON', 'AOS', 'APA', 'APD', 'APH', 'APTV', 'ARE', 'ATO', 'AVB', 'AVGO',
|
| 32 |
'AVY', 'AWK', 'AXON', 'AXP', 'AZO', 'BA', 'BAC', 'BALL', 'BAX', 'BBWI', 'BBY', 'BDX', 'BEN', 'BF-B', 'BG',
|
| 33 |
'BIIB', 'BIO', 'BK', 'BKNG', 'BKR', 'BLDR', 'BLK', 'BMY', 'BR', 'BRK-B', 'BRO', 'BSX', 'BWA', 'BX', 'BXP',
|
| 34 |
'C', 'CAG', 'CAH', 'CARR', 'CAT', 'CB', 'CBOE', 'CBRE', 'CCI', 'CCL', 'CDNS', 'CDW', 'CE', 'CEG', 'CF',
|
| 35 |
'CFG', 'CHD', 'CHRW', 'CHTR', 'CI', 'CINF', 'CL', 'CLX', 'CMCSA', 'CME', 'CMG', 'CMI', 'CMS', 'CNC', 'CNP',
|
| 36 |
'COF', 'COO', 'COP', 'COR', 'COST', 'CPAY', 'CPB', 'CPRT', 'CPT', 'CRL', 'CRM', 'CRWD', 'CSCO', 'CSGP', 'CSX',
|
| 37 |
+
'CTAS', 'CTRA', 'CTSH', 'CTVA', 'CVS', 'CVX', 'CZR', 'D', 'DAL', 'DAY', 'DD', 'DE', 'DECK',
|
| 38 |
'DG', 'DGX', 'DHI', 'DHR', 'DIS', 'DLR', 'DLTR', 'DOC', 'DOV', 'DOW', 'DPZ', 'DRI', 'DTE', 'DUK', 'DVA',
|
| 39 |
'DVN', 'DXCM', 'EA', 'EBAY', 'ECL', 'ED', 'EFX', 'EG', 'EIX', 'EL', 'ELV', 'EMN', 'EMR', 'ENPH', 'EOG',
|
| 40 |
'EPAM', 'EQIX', 'EQR', 'EQT', 'ERIE', 'ES', 'ESS', 'ETN', 'ETR', 'EVRG', 'EW', 'EXC', 'EXPD', 'EXPE', 'EXR',
|
| 41 |
'F', 'FANG', 'FAST', 'FCX', 'FDS', 'FDX', 'FE', 'FFIV', 'FI', 'FICO', 'FIS', 'FITB', 'FMC', 'FOX', 'FOXA',
|
| 42 |
'FRT', 'FSLR', 'FTNT', 'FTV', 'GD', 'GDDY', 'GE', 'GEHC', 'GEN', 'GEV', 'GILD', 'GIS', 'GL', 'GLW', 'GM',
|
| 43 |
+
'GNRC', 'GOOG', 'GOOGL', 'GPC', 'GPN', 'GRMN', 'GS', 'GWW', 'HAL', 'HAS', 'HBAN', 'HCA', 'HD', 'HIG',
|
| 44 |
'HII', 'HLT', 'HOLX', 'HON', 'HPE', 'HPQ', 'HRL', 'HSIC', 'HST', 'HSY', 'HUBB', 'HUM', 'HWM', 'IBM', 'ICE',
|
| 45 |
'IDXX', 'IEX', 'IFF', 'INCY', 'INTC', 'INTU', 'INVH', 'IP', 'IPG', 'IQV', 'IR', 'IRM', 'ISRG', 'IT', 'ITW',
|
| 46 |
+
'IVZ', 'J', 'JBHT', 'JBL', 'JCI', 'JKHY', 'JNJ', 'JPM', 'K', 'KDP', 'KEY', 'KEYS', 'KHC', 'KIM',
|
| 47 |
'KKR', 'KLAC', 'KMB', 'KMI', 'KMX', 'KO', 'KR', 'KVUE', 'L', 'LDOS', 'LEN', 'LH', 'LHX', 'LIN', 'LKQ',
|
| 48 |
'LLY', 'LMT', 'LNT', 'LOW', 'LRCX', 'LULU', 'LUV', 'LVS', 'LW', 'LYB', 'LYV', 'MA', 'MAA', 'MAR', 'MAS',
|
| 49 |
'MCD', 'MCHP', 'MCK', 'MCO', 'MDLZ', 'MDT', 'MET', 'META', 'MGM', 'MHK', 'MKC', 'MKTX', 'MLM', 'MMC', 'MMM',
|
| 50 |
+
'MNST', 'MO', 'MOH', 'MOS', 'MPC', 'MPWR', 'MRK', 'MRNA', 'MRVL', 'MS', 'MSCI', 'MSFT', 'MSI', 'MTB', 'MTCH',
|
| 51 |
'MTD', 'MU', 'NCLH', 'NDAQ', 'NDSN', 'NEE', 'NEM', 'NFLX', 'NI', 'NKE', 'NOC', 'NOW', 'NRG', 'NSC', 'NTAP',
|
| 52 |
'NTRS', 'NUE', 'NVDA', 'NVR', 'NWS', 'NWSA', 'NXPI', 'O', 'ODFL', 'OKE', 'OMC', 'ON', 'ORCL', 'ORLY', 'OTIS',
|
| 53 |
+
'OXY', 'PANW', 'PAYC', 'PAYX', 'PCAR', 'PCG', 'PEG', 'PEP', 'PFE', 'PFG', 'PG', 'PGR', 'PH', 'PHM',
|
| 54 |
'PKG', 'PLD', 'PLTR', 'PM', 'PNC', 'PNR', 'PNW', 'PODD', 'POOL', 'PPG', 'PPL', 'PRU', 'PSA', 'PSX', 'PTC',
|
| 55 |
'PWR', 'PYPL', 'QCOM', 'QRVO', 'RCL', 'REG', 'REGN', 'RF', 'RJF', 'RL', 'RMD', 'ROK', 'ROL', 'ROP', 'ROST',
|
| 56 |
'RSG', 'RTX', 'RVTY', 'SBAC', 'SBUX', 'SCHW', 'SHW', 'SJM', 'SLB', 'SMCI', 'SNA', 'SNPS', 'SO', 'SOLV', 'SPG',
|
|
|
|
| 58 |
'TDY', 'TECH', 'TEL', 'TER', 'TFC', 'TFX', 'TGT', 'TJX', 'TMO', 'TMUS', 'TPR', 'TRGP', 'TRMB', 'TROW', 'TRV',
|
| 59 |
'TSCO', 'TSLA', 'TSN', 'TT', 'TTWO', 'TXN', 'TXT', 'TYL', 'UAL', 'UBER', 'UDR', 'UHS', 'ULTA', 'UNH', 'UNP',
|
| 60 |
'UPS', 'URI', 'USB', 'V', 'VICI', 'VLO', 'VLTO', 'VMC', 'VRSK', 'VRSN', 'VRTX', 'VST', 'VTR', 'VTRS', 'VZ',
|
| 61 |
+
'WAB', 'WAT', 'WBD', 'WDC', 'WEC', 'WELL', 'WFC', 'WM', 'WMB', 'WMT', 'WRB', 'WST', 'WTW', 'WY',
|
| 62 |
'WYNN', 'XEL', 'XOM', 'XYL', 'YUM', 'ZBH', 'ZBRA', 'ZTS',
|
| 63 |
|
| 64 |
# Nasdaq 100 additions (not in S&P 500)
|
| 65 |
+
'AZN', 'APP', 'ARM', 'CCEP', 'DASH', 'DDOG', 'GFS', 'MELI', 'PDD', 'TEAM', 'WDAY',
|
| 66 |
|
| 67 |
# ETFs and Benchmarks
|
| 68 |
'SPY', 'QQQ', 'VTI', 'VOO', 'VXUS', 'DIA', 'IWM', 'TQQQ', 'SQQQ', 'BND', 'TLT', 'IEF', 'GLD', 'DBC', 'VNQ',
|
|
|
|
| 71 |
|
| 72 |
# Notable 2020-2021 IPOs
|
| 73 |
'RIVN', 'LCID', 'RBLX', 'COIN', 'HOOD', 'SNOW', 'U', 'CPNG', 'COUR', 'OSCR', 'SOFI', 'UPST', 'AFRM', 'PATH',
|
| 74 |
+
'BROS', 'DUOL', 'ASAN', 'FVRR', 'DOCS', 'DNUT', 'YOU', 'AI', 'DLO', 'JAMF', 'NCNO', 'JMIA',
|
| 75 |
'DKNG', 'NKLA', 'BLNK', 'QS', 'GOEV', 'LAZR', 'LMND', 'OPEN',
|
| 76 |
|
| 77 |
# 2022 IPOs (notable)
|
|
|
|
| 99 |
BATCH_SIZE = 50 # Download 50 tickers at a time
|
| 100 |
BATCH_DELAY = 5 # Wait 5 seconds between batches
|
| 101 |
MAX_RETRIES = 2 # Retry failed downloads up to 2 times
|
| 102 |
+
UPLOAD_BATCH_SIZE = 100 # Upload 100 files at a time
|
| 103 |
|
| 104 |
|
| 105 |
def download_ticker_data(ticker, start_date, end_date, retry_count=0):
|
| 106 |
"""
|
| 107 |
Download historical data for a single ticker from Yahoo Finance
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
"""
|
| 109 |
try:
|
| 110 |
data = yf.download(
|
|
|
|
| 123 |
|
| 124 |
except Exception as e:
|
| 125 |
if retry_count < MAX_RETRIES:
|
| 126 |
+
time.sleep(2)
|
| 127 |
return download_ticker_data(ticker, start_date, end_date, retry_count + 1)
|
| 128 |
print(f"❌ Error downloading {ticker} after {MAX_RETRIES} retries: {e}")
|
| 129 |
return None
|
|
|
|
| 132 |
def download_batch(tickers_batch, start_date, end_date, batch_num, total_batches):
|
| 133 |
"""
|
| 134 |
Download a batch of tickers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
"""
|
| 136 |
print(f"\n📦 Batch {batch_num}/{total_batches} ({len(tickers_batch)} tickers)")
|
| 137 |
print(f" Tickers: {', '.join(tickers_batch[:10])}{'...' if len(tickers_batch) > 10 else ''}")
|
|
|
|
| 152 |
def update_all_tickers():
|
| 153 |
"""
|
| 154 |
Update CSV files for all tickers with batching and rate limiting
|
|
|
|
|
|
|
|
|
|
| 155 |
"""
|
|
|
|
| 156 |
end_date = datetime.now()
|
| 157 |
start_date = end_date - timedelta(days=5*365)
|
| 158 |
|
|
|
|
| 165 |
print(f"⏱️ Delay between batches: {BATCH_DELAY}s")
|
| 166 |
print(f"{'='*70}")
|
| 167 |
|
|
|
|
| 168 |
os.makedirs(DATA_DIR, exist_ok=True)
|
| 169 |
|
|
|
|
| 170 |
batches = [TICKERS[i:i + BATCH_SIZE] for i in range(0, len(TICKERS), BATCH_SIZE)]
|
| 171 |
total_batches = len(batches)
|
| 172 |
|
| 173 |
success_count = 0
|
| 174 |
error_count = 0
|
| 175 |
|
|
|
|
| 176 |
for batch_num, batch in enumerate(batches, 1):
|
| 177 |
results = download_batch(batch, start_date, end_date, batch_num, total_batches)
|
| 178 |
|
|
|
|
| 179 |
for ticker, data in results.items():
|
| 180 |
filepath = os.path.join(DATA_DIR, f"{ticker}.csv")
|
| 181 |
data.to_csv(filepath)
|
|
|
|
| 183 |
|
| 184 |
error_count += len(batch) - len(results)
|
| 185 |
|
|
|
|
| 186 |
if batch_num < total_batches:
|
| 187 |
print(f" ⏳ Waiting {BATCH_DELAY}s before next batch...")
|
| 188 |
time.sleep(BATCH_DELAY)
|
|
|
|
| 208 |
|
| 209 |
def push_to_huggingface(token):
|
| 210 |
"""
|
| 211 |
+
Push updated CSV files to Hugging Face repository in batches
|
|
|
|
|
|
|
|
|
|
| 212 |
"""
|
| 213 |
try:
|
| 214 |
print(f"\n{'='*70}")
|
| 215 |
+
print("🚀 PUSHING TO HUGGING FACE (batched uploads)")
|
| 216 |
print(f"{'='*70}\n")
|
| 217 |
|
|
|
|
| 218 |
login(token=token)
|
| 219 |
api = HfApi()
|
| 220 |
|
| 221 |
+
# Get list of all files to upload
|
| 222 |
+
all_files = []
|
| 223 |
+
for filename in os.listdir(DATA_DIR):
|
| 224 |
+
filepath = os.path.join(DATA_DIR, filename)
|
| 225 |
+
if os.path.isfile(filepath):
|
| 226 |
+
all_files.append((filepath, f"data/{filename}"))
|
| 227 |
+
|
| 228 |
+
print(f"📁 Total files to upload: {len(all_files)}")
|
| 229 |
+
|
| 230 |
+
# Upload in batches
|
| 231 |
+
total_batches = (len(all_files) + UPLOAD_BATCH_SIZE - 1) // UPLOAD_BATCH_SIZE
|
| 232 |
+
|
| 233 |
+
for batch_num in range(total_batches):
|
| 234 |
+
start_idx = batch_num * UPLOAD_BATCH_SIZE
|
| 235 |
+
end_idx = min(start_idx + UPLOAD_BATCH_SIZE, len(all_files))
|
| 236 |
+
batch = all_files[start_idx:end_idx]
|
| 237 |
+
|
| 238 |
+
print(f"\n📤 Upload batch {batch_num + 1}/{total_batches} ({len(batch)} files)")
|
| 239 |
+
|
| 240 |
+
for filepath, repo_path in batch:
|
| 241 |
+
try:
|
| 242 |
+
api.upload_file(
|
| 243 |
+
path_or_fileobj=filepath,
|
| 244 |
+
path_in_repo=repo_path,
|
| 245 |
+
repo_id=REPO_ID,
|
| 246 |
+
repo_type=REPO_TYPE,
|
| 247 |
+
commit_message=f"Update {os.path.basename(filepath)}"
|
| 248 |
+
)
|
| 249 |
+
except Exception as e:
|
| 250 |
+
print(f" ⚠️ Failed to upload {filepath}: {e}")
|
| 251 |
+
|
| 252 |
+
print(f" ✅ Batch {batch_num + 1} complete")
|
| 253 |
+
|
| 254 |
+
# Small delay between upload batches
|
| 255 |
+
if batch_num < total_batches - 1:
|
| 256 |
+
time.sleep(2)
|
| 257 |
|
| 258 |
+
print(f"\n✅ Successfully pushed to Hugging Face!")
|
| 259 |
print(f"{'='*70}\n")
|
| 260 |
return True
|
| 261 |
|
|
|
|
| 266 |
|
| 267 |
def main():
|
| 268 |
"""Main execution function"""
|
|
|
|
| 269 |
hf_token = os.environ.get("HF_TOKEN")
|
| 270 |
|
| 271 |
if not hf_token:
|
| 272 |
print("❌ ERROR: HF_TOKEN environment variable not set!")
|
|
|
|
| 273 |
sys.exit(1)
|
| 274 |
|
| 275 |
print(f"\n🎯 Horizon Backtester - Bulk Data Update")
|
| 276 |
print(f" Universe: {len(TICKERS)} tickers")
|
| 277 |
+
print(f" Estimated time: ~{(len(TICKERS) // BATCH_SIZE) * BATCH_DELAY // 60 + 10} minutes\n")
|
| 278 |
|
|
|
|
| 279 |
success_count, error_count = update_all_tickers()
|
| 280 |
|
|
|
|
| 281 |
if success_count > 0:
|
| 282 |
push_success = push_to_huggingface(hf_token)
|
| 283 |
if push_success:
|