File size: 3,145 Bytes
59b7143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3
"""
Rebuild ETF dataset from scratch and upload to HuggingFace.
Triggered manually via GitHub Actions.
"""

import sys
import os
import logging

# Setup logging: INFO level, timestamped lines so the GitHub Actions log
# shows when each rebuild phase started.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Add parent directory to path so `data.loader` (a sibling package of this
# script's directory) is importable when the script is run directly.
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))

def main():
    """Rebuild the ETF dataset and upload it to HuggingFace.

    Validates the HF token and ticker configuration, then delegates the
    actual download/upload to ``data.loader.seed_dataset_from_scratch``.

    Exits:
        1 -- when HF_TOKEN is missing, a required ticker is absent from
             FI_TICKERS, or the rebuild raises any exception.
    """
    # Imported lazily: the sys.path tweak at module level must run first.
    from data.loader import seed_dataset_from_scratch, get_safe_token, FI_TICKERS, X_EQUITY_TICKERS

    logger.info("=" * 60)
    logger.info("ETF DATASET REBUILD - STARTED")
    logger.info("=" * 60)

    # Fail fast if the HuggingFace token is not configured.
    logger.info("Checking HF_TOKEN...")
    token = get_safe_token()
    if not token:
        logger.error("❌ HF_TOKEN not found!")
        logger.error("Set HF_TOKEN secret in GitHub repository settings")
        sys.exit(1)
    logger.info("✅ HF_TOKEN found")

    # Show configuration. SPY/AGG are benchmark tickers added on top of the
    # equity and fixed-income universes; the set() de-duplicates overlaps.
    all_tickers = sorted(set(X_EQUITY_TICKERS + FI_TICKERS + ["SPY", "AGG"]))
    logger.info("Equity tickers: %d", len(X_EQUITY_TICKERS))
    logger.info("Fixed Income tickers: %d", len(FI_TICKERS))
    logger.info("Total unique tickers: %d", len(all_tickers))
    logger.info("FI Tickers: %s", FI_TICKERS)

    # Sanity check: TBT was intentionally removed from the universe; warn if
    # loader.py still lists it.
    if 'TBT' in FI_TICKERS:
        logger.warning("⚠️  TBT is still in FI_TICKERS - did you update loader.py?")
    else:
        logger.info("✅ TBT not in ticker list (as expected)")

    # These tickers must be present for downstream consumers of the dataset.
    required = ['VCIT', 'LQD', 'HYG']
    missing_required = [t for t in required if t not in FI_TICKERS]
    if missing_required:
        logger.error("❌ Missing required tickers: %s", missing_required)
        sys.exit(1)
    logger.info("✅ All required tickers present: %s", required)

    # Build dataset
    logger.info("-" * 60)
    logger.info("🚀 Downloading data from Yahoo Finance (2008-present)...")
    logger.info("⏳ This will take 3-5 minutes...")

    try:
        df = seed_dataset_from_scratch()

        logger.info("-" * 60)
        logger.info("✅ SUCCESS! Dataset rebuilt")
        logger.info("📊 Shape: %d rows × %d columns", df.shape[0], df.shape[1])
        logger.info("📅 Date range: %s to %s", df.index.min().date(), df.index.max().date())

        # Verify columns: everything except the SOFR rate column is a price series.
        price_cols = [c for c in df.columns if c not in ['SOFR_ANNUAL']]
        logger.info("📈 Price columns: %d", len(price_cols))

        # Final verification that the excluded ticker did not leak into the data.
        if 'TBT' in df.columns:
            logger.warning("⚠️  TBT column still exists in data!")
        else:
            logger.info("✅ TBT successfully excluded from dataset")

        logger.info("=" * 60)
        logger.info("UPLOAD TO HUGGINGFACE COMPLETE")
        logger.info("Dataset URL: https://huggingface.co/datasets/P2SAMAPA/etf_trend_data")
        logger.info("=" * 60)

    except Exception as e:
        # logger.exception appends the full traceback to the record, replacing
        # the previous manual `traceback.format_exc()` duplication.
        logger.exception("❌ FAILED: %s", e)
        sys.exit(1)

if __name__ == "__main__":
    main()