Spaces:
Sleeping
Sleeping
| """ | |
| Abstract base class for all data sources. | |
| This module defines the interface that all data sources must implement, | |
| enabling easy swapping between free and premium data sources. | |
| Design Principles: | |
| 1. All sources implement the same interface | |
| 2. Sources are responsible for their own validation | |
| 3. Caching is handled at the source level | |
| 4. Sources return standardized pandas DataFrames | |
| """ | |
| from abc import ABC, abstractmethod | |
| from datetime import datetime | |
| from pathlib import Path | |
| from typing import Optional, Dict, Any, List | |
| import pandas as pd | |
| import hashlib | |
| import json | |
| import logging | |
| logger = logging.getLogger(__name__) | |
| class BaseDataSource(ABC): | |
| """ | |
| Abstract base class for all data sources. | |
| All data sources (FRED, CBOE, yfinance, OptionMetrics, etc.) should | |
| inherit from this class and implement the required methods. | |
| Attributes: | |
| name: Human-readable name of the data source | |
| cache_dir: Directory for caching downloaded data | |
| cache_enabled: Whether to use caching | |
| cache_expiry_days: Number of days before cache expires | |
| """ | |
| def __init__( | |
| self, | |
| name: str, | |
| cache_dir: Optional[Path] = None, | |
| cache_enabled: bool = True, | |
| cache_expiry_days: int = 1 | |
| ): | |
| self.name = name | |
| self.cache_dir = Path(cache_dir) if cache_dir else Path("data/raw") | |
| self.cache_enabled = cache_enabled | |
| self.cache_expiry_days = cache_expiry_days | |
| # Ensure cache directory exists | |
| self.cache_dir.mkdir(parents=True, exist_ok=True) | |
| logger.info(f"Initialized {self.name} data source") | |
| def fetch( | |
| self, | |
| start_date: datetime, | |
| end_date: datetime, | |
| **kwargs | |
| ) -> pd.DataFrame: | |
| """ | |
| Fetch data from the source. | |
| Args: | |
| start_date: Start date for data retrieval | |
| end_date: End date for data retrieval | |
| **kwargs: Additional source-specific parameters | |
| Returns: | |
| DataFrame with standardized columns and datetime index | |
| """ | |
| pass | |
| def validate(self, df: pd.DataFrame) -> bool: | |
| """ | |
| Validate the fetched data. | |
| Args: | |
| df: DataFrame to validate | |
| Returns: | |
| True if validation passes, raises exception otherwise | |
| """ | |
| pass | |
| def get_available_series(self) -> List[str]: | |
| """ | |
| Get list of available data series from this source. | |
| Returns: | |
| List of series identifiers | |
| """ | |
| pass | |
| def get_cache_key(self, **kwargs) -> str: | |
| """ | |
| Generate a unique cache key for the request. | |
| Args: | |
| **kwargs: Parameters that affect the data (dates, series, etc.) | |
| Returns: | |
| MD5 hash string as cache key | |
| """ | |
| params = { | |
| 'source': self.name, | |
| **{k: str(v) for k, v in kwargs.items()} | |
| } | |
| param_str = json.dumps(params, sort_keys=True) | |
| return hashlib.md5(param_str.encode()).hexdigest() | |
| def get_cache_path(self, cache_key: str) -> Path: | |
| """Get the file path for a cached item.""" | |
| return self.cache_dir / f"{self.name}_{cache_key}.parquet" | |
| def is_cache_valid(self, cache_path: Path) -> bool: | |
| """Check if cached file exists and is not expired.""" | |
| if not cache_path.exists(): | |
| return False | |
| # Check file age | |
| file_age = datetime.now().timestamp() - cache_path.stat().st_mtime | |
| max_age = self.cache_expiry_days * 24 * 60 * 60 | |
| return file_age < max_age | |
| def load_from_cache(self, cache_key: str) -> Optional[pd.DataFrame]: | |
| """ | |
| Load data from cache if available and valid. | |
| Args: | |
| cache_key: Unique identifier for the cached data | |
| Returns: | |
| DataFrame if cache hit, None otherwise | |
| """ | |
| if not self.cache_enabled: | |
| return None | |
| cache_path = self.get_cache_path(cache_key) | |
| if self.is_cache_valid(cache_path): | |
| logger.info(f"Cache hit for {self.name}: {cache_key[:8]}...") | |
| return pd.read_parquet(cache_path) | |
| return None | |
| def save_to_cache(self, df: pd.DataFrame, cache_key: str) -> None: | |
| """ | |
| Save data to cache. | |
| Args: | |
| df: DataFrame to cache | |
| cache_key: Unique identifier for the cached data | |
| """ | |
| if not self.cache_enabled: | |
| return | |
| cache_path = self.get_cache_path(cache_key) | |
| df.to_parquet(cache_path) | |
| logger.info(f"Cached {self.name} data: {cache_key[:8]}...") | |
| def fetch_with_cache( | |
| self, | |
| start_date: datetime, | |
| end_date: datetime, | |
| **kwargs | |
| ) -> pd.DataFrame: | |
| """ | |
| Fetch data with caching support. | |
| This is the main method that should be called by users. | |
| It handles cache lookup, fetching, validation, and cache storage. | |
| Args: | |
| start_date: Start date for data retrieval | |
| end_date: End date for data retrieval | |
| **kwargs: Additional source-specific parameters | |
| Returns: | |
| Validated DataFrame with requested data | |
| """ | |
| cache_key = self.get_cache_key( | |
| start_date=start_date.isoformat(), | |
| end_date=end_date.isoformat(), | |
| **kwargs | |
| ) | |
| # Try cache first | |
| cached_df = self.load_from_cache(cache_key) | |
| if cached_df is not None: | |
| return cached_df | |
| # Fetch fresh data | |
| logger.info(f"Fetching {self.name} data: {start_date} to {end_date}") | |
| df = self.fetch(start_date, end_date, **kwargs) | |
| # Validate | |
| self.validate(df) | |
| # Cache and return | |
| self.save_to_cache(df, cache_key) | |
| return df | |
| def clear_cache(self) -> int: | |
| """ | |
| Clear all cached files for this source. | |
| Returns: | |
| Number of files deleted | |
| """ | |
| count = 0 | |
| for cache_file in self.cache_dir.glob(f"{self.name}_*.parquet"): | |
| cache_file.unlink() | |
| count += 1 | |
| logger.info(f"Cleared {count} cached files for {self.name}") | |
| return count | |
| class DataSourceError(Exception): | |
| """Base exception for data source errors.""" | |
| pass | |
| class DataFetchError(DataSourceError): | |
| """Raised when data fetching fails.""" | |
| pass | |
| class DataValidationError(DataSourceError): | |
| """Raised when data validation fails.""" | |
| pass | |
| class CacheError(DataSourceError): | |
| """Raised when cache operations fail.""" | |
| pass | |