Edwin Salguero
Enhanced FRED ML with improved Reports & Insights page, fixed alignment analysis, and comprehensive analytics improvements
2469150
| #!/usr/bin/env python3 | |
| """ | |
| Data Validation Script | |
| Test the economic indicators and identify math issues | |
| """ | |
| import os | |
| import sys | |
| import pandas as pd | |
| import numpy as np | |
| from datetime import datetime | |
| # Add src to path | |
| sys.path.append(os.path.join(os.path.dirname(__file__), 'src')) | |
| from src.core.enhanced_fred_client import EnhancedFREDClient | |
def _infer_frequency(index):
    """Return the pandas frequency string for *index*, or None if unknown.

    ``pd.infer_freq`` raises ``ValueError`` when given fewer than three
    timestamps, so short indexes are reported as "unknown" (None) instead of
    aborting the whole validation run.
    """
    if len(index) < 3:
        return None
    return pd.infer_freq(index)


def _unit_issue_kind(indicator, mean_val):
    """Classify a series mean as a 'scale' issue, a 'format' issue, or None.

    'scale'  -- the mean exceeds one million (values may need unit conversion).
    'format' -- an interest-rate series (FEDFUNDS, DGS10) has a mean below 1,
                which may indicate decimal form instead of percentage.
    """
    if mean_val > 1_000_000:
        return 'scale'
    if mean_val < 1 and indicator in ('FEDFUNDS', 'DGS10'):
        return 'format'
    return None


def test_data_validation():
    """Fetch a fixed set of FRED indicators and print validation diagnostics.

    The run prints, in order: the shape/date range/columns of the fetched
    frame, a tail sample, descriptive statistics, missing-value counts,
    per-series observation counts, latest percent change, mean/std with unit
    warnings, the client's data-quality report, inferred frequencies, and a
    final summary of potential issues.

    The API key is read from the ``FRED_API_KEY`` environment variable when it
    is set and non-empty; otherwise the placeholder ``"demo"`` is used.

    Returns:
        None. All results are written to stdout.

    Raises:
        Nothing. Any exception raised while fetching or analysing the data is
        caught, reported with its traceback, and the function returns.
    """
    # Prefer a real key from the environment; fall back to the placeholder.
    # NOTE(review): confirm whether the FRED API actually accepts "demo".
    api_key = os.environ.get("FRED_API_KEY") or "demo"
    print("=== ECONOMIC DATA VALIDATION TEST ===\n")
    try:
        # Initialize client
        client = EnhancedFREDClient(api_key)

        # Test indicators
        indicators = ['GDPC1', 'CPIAUCSL', 'INDPRO', 'RSAFS', 'FEDFUNDS', 'DGS10']

        print("1. Testing data fetching...")
        data = client.fetch_economic_data(
            indicators=indicators,
            start_date='2020-01-01',
            end_date='2024-12-31',
            frequency='auto'
        )
        print(f"Data shape: {data.shape}")
        print(f"Date range: {data.index.min()} to {data.index.max()}")
        print(f"Columns: {list(data.columns)}")

        print("\n2. Raw data sample (last 5 observations):")
        print(data.tail())

        print("\n3. Data statistics:")
        print(data.describe())

        print("\n4. Missing data analysis:")
        print(data.isnull().sum())

        # Drop missing values once per indicator; every later step works on
        # these cleaned series (insertion order follows `indicators`).
        series_by_indicator = {
            indicator: data[indicator].dropna()
            for indicator in indicators
            if indicator in data.columns
        }

        print("\n5. Testing frequency standardization...")
        for indicator, series in series_by_indicator.items():
            print(f"{indicator}: {len(series)} observations, freq: {series.index.freq}")

        print("\n6. Testing growth rate calculation...")
        for indicator, series in series_by_indicator.items():
            # At least two observations are needed for a change and for the
            # "previous -> latest" raw value display.
            if len(series) > 1:
                pct_change = series.pct_change().dropna()
                latest_change = pct_change.iloc[-1] * 100 if len(pct_change) > 0 else 0
                print(f"{indicator}: Latest change = {latest_change:.2f}%")
                print(f" Raw values: {series.iloc[-2]:.2f} -> {series.iloc[-1]:.2f}")

        print("\n7. Testing unit normalization...")
        # indicator -> (issue kind, mean); reused by the final summary so the
        # thresholds are evaluated in exactly one place.
        unit_issues = {}
        for indicator, series in series_by_indicator.items():
            if len(series) > 0:
                mean_val = series.mean()
                std_val = series.std()
                print(f"{indicator}: Mean={mean_val:.2f}, Std={std_val:.2f}")
                kind = _unit_issue_kind(indicator, mean_val)
                if kind is not None:
                    unit_issues[indicator] = (kind, mean_val)
                if kind == 'scale':
                    print(f" WARNING: {indicator} has very large values - may need unit conversion")
                elif kind == 'format':
                    print(f" WARNING: {indicator} has small values - may be in decimal form instead of percentage")

        print("\n8. Testing data quality validation...")
        # The report is read as a dict whose 'missing_data' entry maps each
        # series name to metrics with 'completeness' and 'missing_percentage'.
        quality_report = client.validate_data_quality(data)
        print("Quality report summary:")
        for name, metrics in quality_report['missing_data'].items():
            print(f" {name}: {metrics['completeness']:.1f}% complete")

        print("\n9. Testing frequency alignment...")
        # Check if all series have the same frequency
        frequencies = {}
        for indicator, series in series_by_indicator.items():
            if len(series) > 0:
                freq = _infer_frequency(series.index)
                frequencies[indicator] = freq
                print(f" {indicator}: {freq}")

        # A frequency of None (not inferable) counts as its own value here.
        unique_freqs = set(frequencies.values())
        if len(unique_freqs) > 1:
            print(f" WARNING: Multiple frequencies detected: {unique_freqs}")
            print(" This may cause issues in modeling and forecasting")

        print("\n=== VALIDATION COMPLETE ===")

        # Summary of potential issues
        print("\n=== POTENTIAL ISSUES IDENTIFIED ===")
        issues = []

        # Unit scale / format issues found in step 7
        for indicator, (kind, mean_val) in unit_issues.items():
            if kind == 'scale':
                issues.append(f"Unit scale issue: {indicator} has very large values ({mean_val:.0f})")
            else:
                issues.append(f"Unit format issue: {indicator} may be in decimal form instead of percentage")

        # Frequency issues
        if len(unique_freqs) > 1:
            issues.append(f"Frequency mismatch: Series have different frequencies {unique_freqs}")

        # Missing data above 10% is reported per series
        for name, metrics in quality_report['missing_data'].items():
            if metrics['missing_percentage'] > 10:
                issues.append(f"Missing data: {name} has {metrics['missing_percentage']:.1f}% missing values")

        if issues:
            for issue in issues:
                print(f" • {issue}")
        else:
            print(" No major issues detected")
    except Exception as e:
        # Top-level boundary of a diagnostic script: report and return rather
        # than propagate, so the traceback is always shown to the user.
        print(f"Error during validation: {e}")
        import traceback
        traceback.print_exc()
# Run the validation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_data_validation()