P2-ETF-TREND-SUITE / scripts /seed_dataset.py
github-actions
Sync from GitHub
59b7143
#!/usr/bin/env python3
"""
Rebuild ETF dataset from scratch and upload to HuggingFace.
Triggered manually via GitHub Actions.
"""
import sys
import os
import logging
# Logging: every record carries a timestamp and its level; INFO and above
# are emitted.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

# Import path: put the directory above this script's folder first on
# sys.path so modules located there can be imported.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(_SCRIPT_DIR, '..'))
def main():
    """Rebuild the ETF dataset and log a summary of the run.

    The function:

    1. checks that ``get_safe_token()`` returns a truthy token,
    2. logs the ticker configuration and verifies that the required
       fixed-income tickers are listed in ``FI_TICKERS``,
    3. calls ``seed_dataset_from_scratch()`` and logs the shape, date
       range and column count of the returned frame.

    Exits the process with status 1 when the token is missing, a required
    ticker is missing, the rebuild returns no rows, or the rebuild raises.
    """
    # Imported here rather than at module top: the import relies on the
    # sys.path entry inserted by the module-level setup code.
    from data.loader import (
        FI_TICKERS,
        X_EQUITY_TICKERS,
        get_safe_token,
        seed_dataset_from_scratch,
    )

    banner = "=" * 60
    rule = "-" * 60

    logger.info(banner)
    logger.info("ETF DATASET REBUILD - STARTED")
    logger.info(banner)

    # Check token
    logger.info("Checking HF_TOKEN...")
    if not get_safe_token():
        logger.error("❌ HF_TOKEN not found!")
        logger.error("Set HF_TOKEN secret in GitHub repository settings")
        sys.exit(1)
    logger.info("✅ HF_TOKEN found")

    # Show configuration. Set union (instead of ``+``) works whether the
    # ticker constants are lists or tuples and yields the same sorted result.
    all_tickers = sorted(
        set(X_EQUITY_TICKERS) | set(FI_TICKERS) | {"SPY", "AGG"}
    )
    logger.info(f"Equity tickers: {len(X_EQUITY_TICKERS)}")
    logger.info(f"Fixed Income tickers: {len(FI_TICKERS)}")
    logger.info(f"Total unique tickers: {len(all_tickers)}")
    logger.info(f"FI Tickers: {FI_TICKERS}")

    # Check for TBT: its presence is only warned about, not treated as fatal.
    if 'TBT' in FI_TICKERS:
        logger.warning("⚠️ TBT is still in FI_TICKERS - did you update loader.py?")
    else:
        logger.info("✅ TBT not in ticker list (as expected)")

    # Check for required tickers: any absence aborts the run.
    required = ['VCIT', 'LQD', 'HYG']
    missing_required = [t for t in required if t not in FI_TICKERS]
    if missing_required:
        logger.error(f"❌ Missing required tickers: {missing_required}")
        sys.exit(1)
    logger.info(f"✅ All required tickers present: {required}")

    # Build dataset
    logger.info(rule)
    logger.info("🚀 Downloading data from Yahoo Finance (2008-present)...")
    logger.info("⏳ This will take 3-5 minutes...")
    try:
        df = seed_dataset_from_scratch()

        # Fail with an explicit message instead of an incidental
        # AttributeError / date error further down when nothing came back.
        # sys.exit raises SystemExit, which ``except Exception`` does not
        # intercept.
        if df is None or df.shape[0] == 0:
            logger.error("❌ FAILED: seed_dataset_from_scratch() returned no data")
            sys.exit(1)

        logger.info(rule)
        logger.info("✅ SUCCESS! Dataset rebuilt")
        logger.info(f"📊 Shape: {df.shape[0]} rows × {df.shape[1]} columns")
        logger.info(
            f"📅 Date range: {df.index.min().date()} to {df.index.max().date()}"
        )

        # Verify columns: everything except the SOFR_ANNUAL column is
        # counted as a price column.
        price_cols = [c for c in df.columns if c != 'SOFR_ANNUAL']
        logger.info(f"📈 Price columns: {len(price_cols)}")

        # Final verification
        if 'TBT' in df.columns:
            logger.warning("⚠️ TBT column still exists in data!")
        else:
            logger.info("✅ TBT successfully excluded from dataset")

        # NOTE(review): the upload itself is not visible in this script;
        # presumably seed_dataset_from_scratch() performs it - confirm.
        logger.info(banner)
        logger.info("UPLOAD TO HUGGINGFACE COMPLETE")
        logger.info("Dataset URL: https://huggingface.co/datasets/P2SAMAPA/etf_trend_data")
        logger.info(banner)
    except Exception as exc:
        # Top-level boundary of the script: log message plus traceback in one
        # record, then signal failure to the calling workflow.
        logger.exception("❌ FAILED: %s", exc)
        sys.exit(1)
# Run the rebuild only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()